diff options
author | mpotterc0k | 2013-08-19 15:36:25 +0000 |
---|---|---|
committer | Gerrit Code Review @ Eclipse.org | 2013-08-29 16:13:13 +0000 |
commit | b7c17cd7038b4adfc1e9b14f568540baed73b556 (patch) | |
tree | 880537cc77a652f721b2db17168dae276dce5e43 | |
parent | 419c045952758bcd1c7a2aba685d890ea20db86a (diff) | |
download | org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.gz org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.xz org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.zip |
feature[ats_GVNXN]: Update WORDML and DOORS to HTML conversions
Change-Id: I3467c58e808298b3706fe024618d5cf4796944ed
Signed-off-by: mpotterc0k <marc.a.potter@boeing.com>
6 files changed, 674 insertions, 155 deletions
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java index 3d31f0aa098..1e100be7cd2 100644 --- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java +++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java @@ -15,8 +15,8 @@ import java.io.BufferedInputStream; import java.io.IOException; import java.io.InputStream; import java.net.URL; -import org.junit.Assert; import org.eclipse.osee.framework.jdk.core.util.Lib; +import org.junit.Assert; import org.junit.Test; /** @@ -34,7 +34,6 @@ public class NormalizeHtmlTest { String expected = getResource(CONVERTED_HTML); input = NormalizeHtml.convertToNormalizedHTML(input); - input = bodyOnly(input); expected = bodyOnly(expected); input = input.replaceAll("\r", ""); diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm index 092f5e515e8..075877b41d5 100644 --- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm +++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm @@ -187,45 +187,39 @@ </span> </span> </p> - <table width="4369" border="1" bordercolor="#000000" cellpadding="0" cellspacing="3"> - <colgroup> - <col width="2180" /> - <col width="2178" /> - </colgroup> - <tbody> - <tr valign="TOP"> - <td width="2180"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: xx-large;">26 pt in table with border</span> - </span> - </p> - </td> - <td width="2178"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: xx-large;">26 pt in table with border</span> - </span> - </p> - </td> - </tr> - <tr valign="TOP"> - <td width="2180"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: xx-large;">28 pt in table with border</span> - </span> - </p> - </td> - <td width="2178"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: xx-large;">28 pt in table with border</span> - </span> - </p> - </td> - </tr> - </tbody> + <table border="1"> + <tr valign="top"> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: xx-large;">26 pt in table with border</span> + </span> + </p> + </td> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: xx-large;">26 pt in table with border</span> + </span> + </p> + </td> + </tr> + <tr valign="top"> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: xx-large;">28 pt in table with border</span> + </span> + </p> + </td> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: xx-large;">28 pt in table with border</span> + </span> + </p> + </td> + </tr> </table> <p style="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%"> <br /> @@ -239,45 +233,39 @@ </span> </span> </p> - <table width="4369" border="1" bordercolor="#000000" cellpadding="0" cellspacing="0"> - <colgroup> - <col width="2183" /> - <col width="2184" /> - </colgroup> - <tbody> - <tr valign="TOP"> - <td width="2183"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: 300%;">36 pt in table without border</span> - </span> - </p> - </td> - <td width="2184"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: 300%;">36 pt in table without border</span> - </span> - </p> - </td> - </tr> - <tr valign="TOP"> - <td width="2183"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: 300%;">40 pt in table w/o border</span> - </span> - </p> - </td> - <td width="2184"> - <p style="margin-top: 0.07in"> - <span style=" font-family: Times New Roman, serif"> - <span style=" font-size: 300%;">40 pt in table w/o border</span> - </span> - </p> - </td> - </tr> - </tbody> + <table> + <tr valign="top"> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: 300%;">36 pt in table without border</span> + </span> + </p> + </td> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: 300%;">36 pt in table without border</span> + </span> + </p> + </td> + </tr> + <tr valign="top"> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: 300%;">40 pt in table w/o border</span> + </span> + </p> + </td> + <td> + <p style="margin-top: 0.07in"> + <span style=" font-family: Times New Roman, serif"> + <span style=" font-size: 300%;">40 pt in table w/o border</span> + </span> + </p> + </td> + </tr> </table> <p style="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%"> <br /> diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm index ef338dbdb40..39b724f614f 100644 --- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm +++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm @@ -90,7 +90,7 @@ pt font normal </SPAN></FONT></FONT></FONT> </P> <P STYLE="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%"><FONT COLOR="#000000"><FONT FACE="Times New Roman, serif"><FONT SIZE=7 STYLE="font-size: 32pt"><SPAN STYLE="background: #ffffff">32 pt font</SPAN></FONT></FONT></FONT></P> -<TABLE WIDTH=4369 BORDER=1 BORDERCOLOR="#000000" CELLPADDING=0 CELLSPACING=0> +<TABLE WIDTH=4369 BORDER=0 BORDERCOLOR="#000000" CELLPADDING=0 CELLSPACING=0> <COL WIDTH=2183> <COL WIDTH=2184> <TR VALIGN=TOP> diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java index d2f3f7c9d12..992658e8888 100644 --- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java +++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java @@ -963,8 +963,8 @@ public class Artifact extends NamedIdentity<String> implements IArtifact, IAdapt * enumerated and value is already present */ private final <T> void setOrAddAttribute(IAttributeType attributeType, T value) throws OseeCoreException { - List<Attribute<String>> attributes = getAttributes(attributeType); - for (Attribute<String> canidateAttribute : attributes) { + List<Attribute<Object>> attributes = getAttributes(attributeType); + for (Attribute<?> canidateAttribute : attributes) { if (canidateAttribute.getValue().equals(value)) { return; } diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java index 6396979a1af..30cbfcb8f16 100644 --- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java +++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java @@ -12,6 +12,7 @@ package org.eclipse.osee.framework.skynet.core.importing.parsers; import java.io.File; import java.io.FileFilter; +import java.io.UnsupportedEncodingException; import java.net.URI; import java.net.URISyntaxException; import java.util.HashMap; @@ -54,7 +55,10 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { private final static String BR_TAG = "<br />"; private final static String BODY_START_TAG = "<body>"; private final static String BODY_END_TAG = "</body>"; + private final static String LIST_ITEM_TAG = "<li>"; + private final static String LIST_ITEM_END_TAG = "</li>"; private final static String IMAGE_BASE_NAME = "Image Content_"; + private final String BLANK_HTML_LINE = "<br />"; private final static String[] VERIFICATION_KEYWORDS = { "Effectivity:", "Verf Method:", @@ -223,7 +227,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { tableFound = true; } } else if (qName.equalsIgnoreCase("tr")) { - // Do nothing here -- no processing needed + if (embededTableCount > 0) { + cell.append("<tr>"); + } } else if (qName.equalsIgnoreCase("th")) { if (embededTableCount > 0) { // table within the table @@ -274,6 +280,8 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { throw new SAXException(ex); } } + } else { + cell.append("</tr>"); } } else if (qName.equalsIgnoreCase("th")) { if (embededTableCount > 0) { @@ -415,7 +423,7 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { /*************************************************************** * First check the document applicability box, if it is empty this is a header row */ - boolean isHeaderRow = false, foundDataType = false; + boolean isHeaderRow = false, foundDataType = false, isList; int rowIndex; for (rowIndex = 0; rowIndex < row.length; rowIndex++) { RowTypeEnum rowType = rowIndexToRowTypeMap.get(rowIndex); @@ -438,7 +446,12 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { } } if (!rowIndexToRowTypeMap.isEmpty()) { + isList = false; + int requirementIndex = -1; + String requirementColumn = ""; + boolean isRequirementColumn = false; for (rowIndex = 0; rowIndex < row.length; rowIndex++) { + isRequirementColumn = false; RowTypeEnum rowType = rowIndexToRowTypeMap.get(rowIndex); String rowValue = row[rowIndex]; @@ -465,6 +478,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { } rowValue = ""; } + requirementIndex = rowIndex; + isRequirementColumn = true; + requirementColumn = rowValue; break; @@ -513,7 +529,6 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { case TABLE: case INFORMATION: - case LIST: case FIGURE: isRequirement = lastDataType.equals(DataTypeEnum.REQUIREMENT); break; @@ -524,6 +539,11 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { theArtifact.clear(); return; + case LIST: + isRequirement = lastDataType.equals(DataTypeEnum.REQUIREMENT); + isList = true; + break; + case OTHER: foundDataType = false; lastDataType = DataTypeEnum.OTHER; @@ -543,20 +563,414 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { break; } + if (!isRequirementColumn) { + if (inArtifact) { + ListIterator<String> iter = theArtifact.listIterator(rowIndex); + String theColumnValue = iter.next(); + theColumnValue += " " + rowValue.trim(); + iter.set(theColumnValue); + } else { + theArtifact.add(rowValue.trim()); + } + } else { + if (!inArtifact) { + theArtifact.add(""); + } + } + + } + // process requirement column -- functionally always inArtifact because of the empty add above + ListIterator<String> iter = theArtifact.listIterator(requirementIndex); + String theColumnValue = iter.next(); + if (isList) { + requirementColumn = processList(requirementColumn); + } + theColumnValue += " " + requirementColumn.trim(); + iter.set(theColumnValue); + } + inArtifact = true; + } + + private String processList(String inputValue) { + inputValue = normalizeHtml(inputValue); + /************************************************************************************** + * The way Doors export works with lists is that there is badly spaced <div> statements -- remove them + */ + inputValue = inputValue.replaceAll("<div>", ""); + inputValue = inputValue.replaceAll("</div>", ""); + /********************************************************************************* + * Remove extra blank lines too + */ + inputValue = inputValue.replaceAll(BLANK_HTML_LINE + "\\s+" + BLANK_HTML_LINE, BLANK_HTML_LINE); + StringBuilder returnString = new StringBuilder(inputValue.trim()); + //@formatter:off + /******************************************************************************** + * The Doors export outputs a list as pure text (e.g. a. list item). Convert this to an HTML list + * + * Assumptions: + * 1) The format of the list is either a. or 1. + * 2) There is no embedded 1. or a. in the text of the list. + * That is if 1. shows up in a alpha list it means there is a new list starting or if b. + * shows up after a. then it the next item + */ + //@formatter:on + // find first text char + char[] theChars = stringBuilderToChars(returnString); + int[] startEnd = findEndOfList(theChars, 0); + int iPos = startEnd[0]; + int endOfList = startEnd[1]; + int startOfNextList = startEnd[2]; + boolean isNumeric = Character.isDigit(theChars[iPos]); + boolean isLowerCase = Character.isLowerCase(theChars[iPos]); + int currentNumber = 0; + String currentLetter = ""; + if (isNumeric) { + int startPos = iPos; + while ((theChars[iPos] != '.') && theChars[iPos] != ')') { + iPos++; + } + String theNumber = returnString.substring(startPos, iPos); + currentNumber = Integer.parseInt(theNumber); + } else { + int startPos = iPos; + while ((theChars[iPos] != '.') && theChars[iPos] != ')') { + iPos++; + } + currentLetter = returnString.substring(startPos, iPos); + } + int nextItem = 0; + returnString.delete(iPos - 1, iPos + 1); + endOfList -= 2; + startOfNextList -= 2; + String insertValue = null; + if (isNumeric) { + insertValue = "<ol>"; + } else if (isLowerCase) { + insertValue = "<ol type = \"a\">"; + } else { + insertValue = "<ol type = \"A\">"; + } + returnString.insert(iPos - 1, insertValue); + if (iPos < endOfList) { + endOfList = endOfList + insertValue.length(); + startOfNextList = startOfNextList + insertValue.length(); + } + iPos += insertValue.length(); - if (inArtifact) { - ListIterator<String> iter = theArtifact.listIterator(rowIndex); - String theColumnValue = iter.next(); - theColumnValue += "\n" + rowValue.trim(); - iter.set(theColumnValue); + listData theListData = new listData(); + boolean lastWasSublist = false; + while (nextItem != -1) { + if (theListData.getNewList()) { + lastWasSublist = true; + } else { + lastWasSublist = false; + returnString.insert(iPos - 1, LIST_ITEM_TAG); + if (iPos < endOfList) { + endOfList = endOfList + LIST_ITEM_TAG.length(); + startOfNextList = startOfNextList + LIST_ITEM_TAG.length(); + } + iPos += LIST_ITEM_TAG.length() - 1; + } + theChars = stringBuilderToChars(returnString); + nextItem = findNextListItem(theChars, iPos, isNumeric, isLowerCase, currentNumber, currentLetter, theListData); + if (nextItem == -1) { + break; + } + + if (theListData.getNewList()) { + int startPoint = (nextItem < startOfNextList) ? nextItem : startOfNextList; + String theSublist = returnString.substring(0, startPoint); + int end = theListData.getNextItem(); + if (end >= returnString.length()) { + end = returnString.length() - 1; + } + + String theRawSublist = new String(theChars, startPoint, end - startPoint + 1); + int initialLen = theRawSublist.length(); + theRawSublist = processList(theRawSublist); + theSublist += theRawSublist; + theSublist += LIST_ITEM_END_TAG; + int delta = (theRawSublist.length() - initialLen) + LIST_ITEM_END_TAG.length(); + endOfList += delta; + startOfNextList += delta; + if ((theListData.getNextItem() != -1) && (theListData.getNextItem() < returnString.length())) { + theSublist += returnString.substring(theListData.getNextItem() + 1); + } + returnString.delete(0, returnString.length()); + returnString.append(theSublist); + + } else { + if (isNumeric) { + currentNumber = + Integer.valueOf(returnString.substring(nextItem, nextItem + theListData.getItemLength() - 1)); } else { - theArtifact.add(rowValue.trim()); + currentLetter = returnString.substring(nextItem, nextItem + theListData.getItemLength() - 1); + } + returnString.delete(nextItem, nextItem + theListData.getItemLength()); + endOfList -= theListData.getItemLength(); + startOfNextList -= theListData.getItemLength(); + /************************************************************* + * Since we are converting a line of text, there is a blank line after it. Delete the <BR> + * </BR> + */ + if (!lastWasSublist) { + int end = nextItem; + if (end > returnString.length()) { + end = returnString.length(); + } + String test = returnString.substring(0, end); + int lastPoint = test.lastIndexOf(BLANK_HTML_LINE); + if (lastPoint != -1) { + returnString.delete(lastPoint, end); + int delta = test.length() - lastPoint; + endOfList -= delta; + nextItem -= delta; + startOfNextList -= delta; + } } + if (!lastWasSublist) { + returnString.insert(nextItem, LIST_ITEM_END_TAG); + if (nextItem < endOfList) { + endOfList = endOfList + LIST_ITEM_END_TAG.length(); + startOfNextList = startOfNextList + LIST_ITEM_END_TAG.length(); + } + nextItem = nextItem + LIST_ITEM_END_TAG.length(); + } + iPos = nextItem + 1; + } + theChars = stringBuilderToChars(returnString); + } + // find the insertion point for list end + String tokenToInsert = "</li></ol>"; + if (theListData.getNewList()) { + tokenToInsert = "</ol>"; + } + + if (endOfList < theChars.length) { + returnString.insert(endOfList, tokenToInsert); + } else { + // verify the list doesn't end with <BR></BR> + String test = returnString.toString(); + int lastPoint = test.lastIndexOf(BLANK_HTML_LINE); + if (lastPoint == (test.length() - BLANK_HTML_LINE.length())) { + returnString.delete(lastPoint, returnString.length()); + } + returnString.append(tokenToInsert); + } + + return returnString.toString(); + } + private int[] findEndOfList(char[] theChars, int startPoint) { + int iPos = startPoint; + int[] iReturn = {0, theChars.length, theChars.length}; + int tagCount = 0; + boolean notFirst = false; + boolean foundNonTagItem = false; + while (iPos < theChars.length) { + while ((iPos < theChars.length) && ((theChars[iPos] == '\t') || (theChars[iPos] == '\n') || (Character.isWhitespace(theChars[iPos])))) { + iPos++; + } + if (iPos >= theChars.length) { + iReturn[1] = theChars.length; + break; + } + if (theChars[iPos] == '<') { + int startofCloseTag = iPos; + iPos++; + if (theChars[iPos] == '/') { + tagCount--; + while ((iPos < theChars.length) && (theChars[iPos] != '>')) { + iPos++; + } + if (((tagCount == 0) && foundNonTagItem) || (tagCount < 0)) { + iReturn[1] = startofCloseTag; + iReturn[2] = iPos; + while ((iReturn[2] < theChars.length) && (theChars[iReturn[2]] != '<')) { + iReturn[2] = iReturn[2] + 1; + } + break; + } + } else { + tagCount++; + } + while ((iPos < theChars.length) && (theChars[iPos] != '>')) { + iPos++; + } + iPos++; + } else if (notFirst) { + if (!foundNonTagItem) { + iReturn[0] = iPos; + foundNonTagItem = true; + } + if (tagCount == 0) { + break; + } else { + // find next tag + while ((iPos < theChars.length) && (theChars[iPos] != '<')) { + iPos++; + } + iReturn[1] = iPos - 1; + // find the end of the tag + iReturn[2] = iPos; + while ((iReturn[2] < theChars.length) && (theChars[iReturn[2]] != '>')) { + iReturn[2] = iReturn[2] + 1; + } + iReturn[2] = iReturn[2] + 1; + } + } else { + // no opening tags, therefore list not enclosed in tags. + iPos = theChars.length; } + notFirst = true; + } + return iReturn; + } + static char[] stringBuilderToChars(StringBuilder sb) { + char[] returnArray = new char[sb.length()]; + sb.getChars(0, sb.length(), returnArray, 0); + return returnArray; + } + + private class listData { + private boolean newList; + private int itemLength; + private int nextItem; + + public listData() { + this.newList = false; + this.itemLength = 0; } - inArtifact = true; + + public int getItemLength() { + return itemLength; + } + + public int getNextItem() { + return nextItem; + } + + public boolean getNewList() { + return newList; + } + + public void setNextItem(int nextItem) { + this.nextItem = nextItem; + } + + public void setItemLength(int itemLength) { + this.itemLength = itemLength; + } + + public void setNewList(boolean newList) { + this.newList = newList; + } + } + + private int findNextListItem(char[] theChars, int iPos, boolean isNumeric, boolean isLowerCase, int currentNumber, String currentLetter, listData listData) { + //@formatter:off + /**************************************************************************** + * Now the tricky part. We are looking for + * 1) <space><next value>.<space or or &#something> + * 2) <space><next level value>. + */ + //@formatter:on + + iPos++; + if (iPos >= theChars.length) { + return -1; + } + StringBuilder asString = new StringBuilder(); + asString.append(theChars, iPos, theChars.length - iPos); + int aListDot = asString.toString().toLowerCase().indexOf("a."); + int aListParen = asString.toString().toLowerCase().indexOf("a."); + int aList = -1; + if (aListDot == -1) { + aList = aListParen; + } else if (aListParen == -1) { + aList = aListDot; + } else { + aList = (aListDot < aListParen) ? aListDot : aListParen; + } + int oneListDot = asString.indexOf("1."); + int oneListParen = asString.indexOf("1)"); + int oneList = -1; + if (oneListDot == -1) { + oneList = oneListParen; + } else if (aListParen == -1) { + oneList = oneListDot; + } else { + oneList = (oneListDot < oneListParen) ? oneListDot : oneListParen; + } + + int nextListItem = -1; + String nextItem = ""; + if (isNumeric) { + nextItem = Integer.toString(currentNumber + 1) + "."; + } else { + // assume Ascii -- that is, that the letters are contiguous + byte[] theLetters = null; + try { + theLetters = currentLetter.getBytes("UTF-8"); + } catch (UnsupportedEncodingException e) { + theLetters = currentLetter.getBytes(); + } + int theCharToChange = theLetters.length - 1; + if (currentLetter.toLowerCase().charAt(theCharToChange) == 'z') { + if (theCharToChange > 0) { + theLetters[theCharToChange - 1]++; + if (isLowerCase) { + theLetters[theCharToChange] = "a".getBytes()[0]; + } else { + theLetters[theCharToChange] = "A".getBytes()[0]; + } + } else { + byte[] newLetterArray = new byte[theLetters.length + 1]; + for (int i = 0; i < newLetterArray.length; i++) { + if (isLowerCase) { + newLetterArray[i] = "a".getBytes()[0]; + } else { + newLetterArray[i] = "A".getBytes()[0]; + } + } + theLetters = newLetterArray; + } + } else { + theLetters[0]++; + } + nextItem = new String(theLetters) + "."; + } + nextListItem = asString.indexOf(nextItem); + if (nextListItem != -1) { + // verify this is not just a char and period + char prev = asString.charAt(nextListItem - 1); + while (!(Character.isWhitespace(prev) || (prev == ';') || (prev == '>'))) { + nextListItem = asString.indexOf(nextItem, nextListItem + 1); + if (nextListItem == -1) { + break; + } + prev = asString.charAt(nextListItem - 1); + } + } + if ((aList == -1) && (oneList == -1) && (nextListItem == -1)) { + return -1; + } + aList = (aList != -1) ? aList + iPos : theChars.length + 1; + oneList = (oneList != -1) ? oneList + iPos : theChars.length + 1; + nextListItem = (nextListItem != -1) ? nextListItem + iPos : theChars.length + 1; + int iReturn = (aList < oneList) ? aList : oneList; + iReturn = (iReturn < nextListItem) ? iReturn : nextListItem; + if (iReturn == nextListItem) { + listData.setNewList(false); + listData.setItemLength(nextItem.length()); + listData.setNextItem(nextListItem); + } else { + listData.setNewList(true); + listData.setItemLength(2); + listData.setNextItem(nextListItem - 1); + } + return iReturn; } private void processArtifact() throws OseeCoreException { @@ -585,7 +999,7 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { case REQUIREMENTS: StringBuffer imageFileList = new StringBuffer(""); getImageList(rowValue, imageFileList); - rowValue = normailizeHtml(rowValue); + rowValue = normalizeHtml(rowValue); String imageFile = imageFileList.toString(); if (!imageFile.isEmpty()) { String theImage; @@ -783,9 +1197,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { return returnValue; } - private String normailizeHtml(String inputHtml) { + private String normalizeHtml(String inputHtml) { - String returnValue = NormalizeHtml.convertToNormalizedHTML(inputHtml); + String returnValue = NormalizeHtml.convertToNormalizedHTML(inputHtml, true, true, true); int bodyStart = returnValue.indexOf(BODY_START_TAG); int bodyEnd = returnValue.indexOf(BODY_END_TAG); if (bodyStart != -1) { @@ -821,10 +1235,10 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor { * tags these are not meaningful */ returnValue = returnValue.trim(); - int brTag = returnValue.lastIndexOf(BR_TAG); - while (brTag == returnValue.length() - BR_TAG.length()) { + int brTag = returnValue.toLowerCase().lastIndexOf(BR_TAG); + while ((brTag != -1) && (brTag == returnValue.length() - BR_TAG.length())) { returnValue = returnValue.substring(0, brTag).trim(); - brTag = returnValue.lastIndexOf(BR_TAG); + brTag = returnValue.toLowerCase().lastIndexOf(BR_TAG); } return returnValue; } diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java index 2724c31da23..04e3af90248 100644 --- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java +++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java @@ -10,6 +10,7 @@ *******************************************************************************/ package org.eclipse.osee.framework.skynet.core.utility; +import java.util.ArrayList; import java.util.TreeMap; import org.jsoup.Jsoup; import org.jsoup.nodes.Attribute; @@ -19,7 +20,6 @@ import org.jsoup.nodes.Document.OutputSettings; import org.jsoup.nodes.Document.QuirksMode; import org.jsoup.nodes.Element; import org.jsoup.nodes.Entities.EscapeMode; -import org.jsoup.nodes.TextNode; import org.jsoup.parser.Tag; import org.jsoup.select.Elements; @@ -63,11 +63,15 @@ public final class NormalizeHtml { private static final String rdquo = String.valueOf('\u201D'); private static final String lsquo = String.valueOf('\u2018'); private static final String rsquo = String.valueOf('\u2019'); + private static final String figureDash = String.valueOf('\u2012'); + private static final String enDash = String.valueOf('\u2013'); + private static final String emDash = String.valueOf('\u2014'); private static final String NON_BREAK_SPACE = String.valueOf('\u00A0'); private static final String NON_BREAK_FIGURE_SPACE = String.valueOf('\u2007'); private static final String NON_BREAK_NARROW_SPACE = String.valueOf('\u202F'); private static final String NON_BREAK_WORD_JOINER = String.valueOf('\u2060'); private static final String NON_BREAK_ZERO_WIDTH = String.valueOf('\uFEFF'); + private static ArrayList<String> allowedAttributes = null; private NormalizeHtml() { // Utility Class @@ -93,7 +97,7 @@ public final class NormalizeHtml { * @return Normalized HTML */ public static String convertToNormalizedHTML(String inputHTML) { - return convertToNormalizedHTML(inputHTML, false, false); + return convertToNormalizedHTML(inputHTML, false, false, false); } /** @@ -103,10 +107,10 @@ public final class NormalizeHtml { * * @param inputHTML HTML source to be normalized * @param removeInitialStyle Remove initial style information. - * @param removeEmptyStyle Remove any empty (containing no text) style sections + * @param removeEmptyTags Remove any empty (containing no text) style sections * @return Normalized HTML */ - public static String convertToNormalizedHTML(String inputHTML, boolean removeInitialStyle, boolean removeEmptyStyle) { + public static String convertToNormalizedHTML(String inputHTML, boolean removeInitialStyle, boolean removeEmptyTags, boolean removeHeaderFooter) { Document doc = Jsoup.parse(inputHTML); doc.quirksMode(QuirksMode.noQuirks); OutputSettings outputSettings = doc.outputSettings(); @@ -134,77 +138,154 @@ public final class NormalizeHtml { e.tagName("span"); e.attr("style", "text-decoration: line-through;"); } + removeDepreactedTags(doc); + processTagsWithAttributes(doc); + processHeaderFooter(doc, removeHeaderFooter); processFontTags(doc); processInitialStyleTags(doc, removeInitialStyle); - processEmptyStyleTags(doc, removeEmptyStyle); + processEmptyTags(doc, removeEmptyTags); return processText(doc); } + static void removeDepreactedTags(Document doc) { + Elements center = doc.select("center"); + for (Element e : center) { + Elements children = e.children(); + for (Element c : children) { + e.before(c); + } + e.remove(); + } + } + + private static void processTagsWithAttributes(Document doc) { + /**************************************************************************** + * HTML allows the same table to be represented many ways. Normalize the information into a standard format. Note + * this will simplify the table as well (that is some formatting may be lost) Remember, the goal is to reduce the + * HTML to the point that it is the same regardless of the source editor Also images have similar issues -- + * normalize to the basic keyword + */ + + if (allowedAttributes == null) { + allowedAttributes = new ArrayList<String>(); + allowedAttributes.add("border"); + allowedAttributes.add("frame"); + allowedAttributes.add("rules"); + allowedAttributes.add("valign"); + allowedAttributes.add("src"); + } + Elements tables = doc.select("table"); + for (Element table : tables) { + removeUnsupportedAttributes(table, true); + // remove Colgroup + Elements colgroup = table.select("colgroup"); + for (Element c : colgroup) { + c.remove(); + } + // no support for header / footer -- just rows + + removeElements(table, "thead"); + removeElements(table, "tfoot"); + removeElements(table, "tbody"); + // remove unsupported attributes on tr and td tags and move the attributes from td to tr + Elements rows = table.select("td"); + for (Element row : rows) { + String[] attributeValues = removeUnsupportedAttributes(row, false); + Element tr = null; + Element parent = row.parent(); + if (parent.tagName().equals("tr")) { + tr = parent; + } else { + Elements siblings = row.siblingElements(); + for (Element e : siblings) { + if (e.tagName().equals("tr")) { + tr = e; + break; + } + } + } + if (tr != null) { + for (int i = 0; i < attributeValues.length; i++) { + if (attributeValues[i] != null) { + tr.attr(allowedAttributes.get(i), attributeValues[i].toLowerCase()); + } + } + } + } + rows = table.select("tr"); + for (Element row : rows) { + removeUnsupportedAttributes(row, true); + } + } + + Elements images = doc.select("img"); + for (Element image : images) { + removeUnsupportedAttributes(image, true); + } + } + static void processInitialStyleTags(Document doc, boolean removeInitialStyle) { if (removeInitialStyle) { - boolean foundText = false; Elements pTags = doc.select("p"); for (Element p : pTags) { - Elements style = p.getElementsByAttribute("style"); - for (Element e : style) { - Element parent = e.parent(); - if (!parent.tagName().equals("span")) { - if (e.hasText()) { - String text = e.text(); - TextNode newNode = new TextNode(text, e.baseUri()); - e.remove(); - // Insert newline between various text elements - if (foundText) { - Tag tag = Tag.valueOf("br"); - Element br = new Element(tag, parent.baseUri()); - parent.appendChild(br); - br = new Element(tag, parent.baseUri()); - parent.appendChild(br); - } - parent.appendChild(newNode); - foundText = true; - break; + if (!p.attr("style").equals("")) { + if (p.hasText()) { + if (!p.parent().tagName().equals("li")) { + Element cr = new Element(Tag.valueOf("br"), p.baseUri()); + p.after(cr); } + p.unwrap(); } } } Elements span = doc.select("span"); for (Element s : span) { - Elements style = s.getElementsByAttributeValueMatching("style", "font*|margin*"); - for (Element e : style) { - Element parent = e.parent(); - if (!parent.tagName().equals("p")) { - if (e.hasText()) { - String text = e.text(); - TextNode newNode = new TextNode(text, e.baseUri()); - e.remove(); - parent.appendChild(newNode); - // Insert newline between various text elements - if (foundText) { - Tag tag = Tag.valueOf("br"); - Element br = new Element(tag, parent.baseUri()); - parent.appendChild(br); - parent.appendChild(br); - } - foundText = true; - break; - } - } + if (!s.attr("style").equals("") && !s.hasText()) { + s.remove(); } } } } - private static void processEmptyStyleTags(Document doc, boolean removeEmptyStyle) { - if (removeEmptyStyle) { + private static void processEmptyTags(Document doc, boolean removeEmptyTags) { + if (removeEmptyTags) { Elements pTags = doc.select("p"); for (Element p : pTags) { + // Element cr = new Element(Tag.valueOf("br"), p.baseUri()); + //p.after(cr); deleteEmptyElemens(p); } Elements span = doc.select("span"); for (Element s : span) { deleteEmptyElemens(s); } + Elements div = doc.select("div"); + for (Element e : div) { + if (!e.hasText()) { + e.remove(); + } else { + e.unwrap(); + } + } + Elements aTags = doc.select("a"); + for (Element a : aTags) { + Attributes attr = a.attributes(); + if ((attr.size() == 1) && (!attr.get("name").equals(""))) { + a.unwrap(); + } + } + } + } + + static void processHeaderFooter(Document doc, boolean removeHeaderFooter) { + if (removeHeaderFooter) { + Elements div = doc.select("div"); + for (Element d : div) { + Elements headerFooter = d.getElementsByAttributeValueMatching("type", "HEADER*|FOOTER*"); + for (Element hf : headerFooter) { + hf.remove(); + } + } } } @@ -215,18 +296,12 @@ public final class NormalizeHtml { Elements style = elementToCheck.getElementsByAttributeValueMatching("style", "font*|margin*"); for (Element e : style) { if (e.hasText()) { - break; + continue; } else { - Elements images = e.select("img"); - Element parent = e.parent(); - e.remove(); - if (images.size() > 0) { - for (Element image : images) { - parent.appendChild(image); - } - } + e.unwrap(); } } + } private static String processText(Document doc) { @@ -243,6 +318,14 @@ public final class NormalizeHtml { theText = theText.replaceAll(rsquo, "'"); theText = theText.replaceAll("'", "'"); + /************************************************************************ + * Convert – and Unicode dashes to -. Not all editors handle this correctly + */ + theText = theText.replaceAll("–", "-"); + theText = theText.replaceAll(figureDash, "-"); + theText = theText.replaceAll(enDash, "-"); + theText = theText.replaceAll(emDash, "-"); + //@formatter:off /***************************************************************************** * Convert the non-blocking characters to the HTML value ( ) @@ -324,4 +407,39 @@ public final class NormalizeHtml { return theReturn; } + static void removeElements(Element table, String theElementParent) { + Elements parents = table.select(theElementParent); + for (Element p : parents) { + Elements children = p.children(); + for (Element c : children) { + p.before(c); + } + p.remove(); + } + + } + + static String[] removeUnsupportedAttributes(Element e, boolean addBack) { + String[] attributeValues = {null, null, null, null, null}; + // remove "unsupported" attributes + Attributes attr = e.attributes(); + for (Attribute a : attr) { + if (allowedAttributes.contains(a.getKey())) { + if (!(a.getKey().equals("border") && a.getValue().equals("0"))) { + attributeValues[allowedAttributes.indexOf(a.getKey())] = a.getValue(); + } + } + e.removeAttr(a.getKey()); + } + if (addBack) { + // set specific order for attributes + for (int i = 0; i < attributeValues.length; i++) { + if (attributeValues[i] != null) { + e.attr(allowedAttributes.get(i), attributeValues[i]); + } + } + } + return attributeValues; + } + } |