Skip to main content
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormpotterc0k2013-08-19 15:36:25 +0000
committerGerrit Code Review @ Eclipse.org2013-08-29 16:13:13 +0000
commitb7c17cd7038b4adfc1e9b14f568540baed73b556 (patch)
tree880537cc77a652f721b2db17168dae276dce5e43
parent419c045952758bcd1c7a2aba685d890ea20db86a (diff)
downloadorg.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.gz
org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.xz
org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.zip
feature[ats_GVNXN]: Update WORDML and DOORS to HTML conversions
Change-Id: I3467c58e808298b3706fe024618d5cf4796944ed Signed-off-by: mpotterc0k <marc.a.potter@boeing.com>
-rw-r--r--plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java3
-rw-r--r--plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm144
-rw-r--r--plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm2
-rw-r--r--plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java4
-rw-r--r--plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java446
-rw-r--r--plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java230
6 files changed, 674 insertions, 155 deletions
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java
index 3d31f0aa098..1e100be7cd2 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java
@@ -15,8 +15,8 @@ import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
-import org.junit.Assert;
import org.eclipse.osee.framework.jdk.core.util.Lib;
+import org.junit.Assert;
import org.junit.Test;
/**
@@ -34,7 +34,6 @@ public class NormalizeHtmlTest {
String expected = getResource(CONVERTED_HTML);
input = NormalizeHtml.convertToNormalizedHTML(input);
-
input = bodyOnly(input);
expected = bodyOnly(expected);
input = input.replaceAll("\r", "");
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm
index 092f5e515e8..075877b41d5 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm
+++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm
@@ -187,45 +187,39 @@
</span>
</span>
</p>
- <table width="4369" border="1" bordercolor="#000000" cellpadding="0" cellspacing="3">
- <colgroup>
- <col width="2180" />
- <col width="2178" />
- </colgroup>
- <tbody>
- <tr valign="TOP">
- <td width="2180">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: xx-large;">26 pt in table with border</span>
- </span>
- </p>
- </td>
- <td width="2178">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: xx-large;">26 pt in table with border</span>
- </span>
- </p>
- </td>
- </tr>
- <tr valign="TOP">
- <td width="2180">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: xx-large;">28 pt in table with border</span>
- </span>
- </p>
- </td>
- <td width="2178">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: xx-large;">28 pt in table with border</span>
- </span>
- </p>
- </td>
- </tr>
- </tbody>
+ <table border="1">
+ <tr valign="top">
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: xx-large;">26 pt in table with border</span>
+ </span>
+ </p>
+ </td>
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: xx-large;">26 pt in table with border</span>
+ </span>
+ </p>
+ </td>
+ </tr>
+ <tr valign="top">
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: xx-large;">28 pt in table with border</span>
+ </span>
+ </p>
+ </td>
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: xx-large;">28 pt in table with border</span>
+ </span>
+ </p>
+ </td>
+ </tr>
</table>
<p style="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%">
<br />
@@ -239,45 +233,39 @@
</span>
</span>
</p>
- <table width="4369" border="1" bordercolor="#000000" cellpadding="0" cellspacing="0">
- <colgroup>
- <col width="2183" />
- <col width="2184" />
- </colgroup>
- <tbody>
- <tr valign="TOP">
- <td width="2183">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: 300%;">36 pt in table without border</span>
- </span>
- </p>
- </td>
- <td width="2184">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: 300%;">36 pt in table without border</span>
- </span>
- </p>
- </td>
- </tr>
- <tr valign="TOP">
- <td width="2183">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: 300%;">40 pt in table w/o border</span>
- </span>
- </p>
- </td>
- <td width="2184">
- <p style="margin-top: 0.07in">
- <span style=" font-family: Times New Roman, serif">
- <span style=" font-size: 300%;">40 pt in table w/o border</span>
- </span>
- </p>
- </td>
- </tr>
- </tbody>
+ <table>
+ <tr valign="top">
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: 300%;">36 pt in table without border</span>
+ </span>
+ </p>
+ </td>
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: 300%;">36 pt in table without border</span>
+ </span>
+ </p>
+ </td>
+ </tr>
+ <tr valign="top">
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: 300%;">40 pt in table w/o border</span>
+ </span>
+ </p>
+ </td>
+ <td>
+ <p style="margin-top: 0.07in">
+ <span style=" font-family: Times New Roman, serif">
+ <span style=" font-size: 300%;">40 pt in table w/o border</span>
+ </span>
+ </p>
+ </td>
+ </tr>
</table>
<p style="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%">
<br />
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm
index ef338dbdb40..39b724f614f 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm
+++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm
@@ -90,7 +90,7 @@ pt font normal </SPAN></FONT></FONT></FONT>
</P>
<P STYLE="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%"><FONT COLOR="#000000"><FONT FACE="Times New Roman, serif"><FONT SIZE=7 STYLE="font-size: 32pt"><SPAN STYLE="background: #ffffff">32
pt font</SPAN></FONT></FONT></FONT></P>
-<TABLE WIDTH=4369 BORDER=1 BORDERCOLOR="#000000" CELLPADDING=0 CELLSPACING=0>
+<TABLE WIDTH=4369 BORDER=0 BORDERCOLOR="#000000" CELLPADDING=0 CELLSPACING=0>
<COL WIDTH=2183>
<COL WIDTH=2184>
<TR VALIGN=TOP>
diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java
index d2f3f7c9d12..992658e8888 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java
@@ -963,8 +963,8 @@ public class Artifact extends NamedIdentity<String> implements IArtifact, IAdapt
* enumerated and value is already present
*/
private final <T> void setOrAddAttribute(IAttributeType attributeType, T value) throws OseeCoreException {
- List<Attribute<String>> attributes = getAttributes(attributeType);
- for (Attribute<String> canidateAttribute : attributes) {
+ List<Attribute<Object>> attributes = getAttributes(attributeType);
+ for (Attribute<?> canidateAttribute : attributes) {
if (canidateAttribute.getValue().equals(value)) {
return;
}
diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java
index 6396979a1af..30cbfcb8f16 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java
@@ -12,6 +12,7 @@ package org.eclipse.osee.framework.skynet.core.importing.parsers;
import java.io.File;
import java.io.FileFilter;
+import java.io.UnsupportedEncodingException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashMap;
@@ -54,7 +55,10 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
private final static String BR_TAG = "<br />";
private final static String BODY_START_TAG = "<body>";
private final static String BODY_END_TAG = "</body>";
+ private final static String LIST_ITEM_TAG = "<li>";
+ private final static String LIST_ITEM_END_TAG = "</li>";
private final static String IMAGE_BASE_NAME = "Image Content_";
+ private final String BLANK_HTML_LINE = "<br />";
private final static String[] VERIFICATION_KEYWORDS = {
"Effectivity:",
"Verf Method:",
@@ -223,7 +227,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
tableFound = true;
}
} else if (qName.equalsIgnoreCase("tr")) {
- // Do nothing here -- no processing needed
+ if (embededTableCount > 0) {
+ cell.append("<tr>");
+ }
} else if (qName.equalsIgnoreCase("th")) {
if (embededTableCount > 0) {
// table within the table
@@ -274,6 +280,8 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
throw new SAXException(ex);
}
}
+ } else {
+ cell.append("</tr>");
}
} else if (qName.equalsIgnoreCase("th")) {
if (embededTableCount > 0) {
@@ -415,7 +423,7 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
/***************************************************************
* First check the document applicability box, if it is empty this is a header row
*/
- boolean isHeaderRow = false, foundDataType = false;
+ boolean isHeaderRow = false, foundDataType = false, isList;
int rowIndex;
for (rowIndex = 0; rowIndex < row.length; rowIndex++) {
RowTypeEnum rowType = rowIndexToRowTypeMap.get(rowIndex);
@@ -438,7 +446,12 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
}
}
if (!rowIndexToRowTypeMap.isEmpty()) {
+ isList = false;
+ int requirementIndex = -1;
+ String requirementColumn = "";
+ boolean isRequirementColumn = false;
for (rowIndex = 0; rowIndex < row.length; rowIndex++) {
+ isRequirementColumn = false;
RowTypeEnum rowType = rowIndexToRowTypeMap.get(rowIndex);
String rowValue = row[rowIndex];
@@ -465,6 +478,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
}
rowValue = "";
}
+ requirementIndex = rowIndex;
+ isRequirementColumn = true;
+ requirementColumn = rowValue;
break;
@@ -513,7 +529,6 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
case TABLE:
case INFORMATION:
- case LIST:
case FIGURE:
isRequirement = lastDataType.equals(DataTypeEnum.REQUIREMENT);
break;
@@ -524,6 +539,11 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
theArtifact.clear();
return;
+ case LIST:
+ isRequirement = lastDataType.equals(DataTypeEnum.REQUIREMENT);
+ isList = true;
+ break;
+
case OTHER:
foundDataType = false;
lastDataType = DataTypeEnum.OTHER;
@@ -543,20 +563,414 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
break;
}
+ if (!isRequirementColumn) {
+ if (inArtifact) {
+ ListIterator<String> iter = theArtifact.listIterator(rowIndex);
+ String theColumnValue = iter.next();
+ theColumnValue += " " + rowValue.trim();
+ iter.set(theColumnValue);
+ } else {
+ theArtifact.add(rowValue.trim());
+ }
+ } else {
+ if (!inArtifact) {
+ theArtifact.add("");
+ }
+ }
+
+ }
+ // process requirement column -- functionally always inArtifact because of the empty add above
+ ListIterator<String> iter = theArtifact.listIterator(requirementIndex);
+ String theColumnValue = iter.next();
+ if (isList) {
+ requirementColumn = processList(requirementColumn);
+ }
+ theColumnValue += " " + requirementColumn.trim();
+ iter.set(theColumnValue);
+ }
+ inArtifact = true;
+ }
+
+ private String processList(String inputValue) {
+ inputValue = normalizeHtml(inputValue);
+ /**************************************************************************************
+ * The way Doors export works with lists is that there is badly spaced <div> statements -- remove them
+ */
+ inputValue = inputValue.replaceAll("<div>", "");
+ inputValue = inputValue.replaceAll("</div>", "");
+ /*********************************************************************************
+ * Remove extra blank lines too
+ */
+ inputValue = inputValue.replaceAll(BLANK_HTML_LINE + "\\s+" + BLANK_HTML_LINE, BLANK_HTML_LINE);
+ StringBuilder returnString = new StringBuilder(inputValue.trim());
+ //@formatter:off
+ /********************************************************************************
+ * The Doors export outputs a list as pure text (e.g. a. list item). Convert this to an HTML list
+ *
+ * Assumptions:
+ * 1) The format of the list is either a. or 1.
+ * 2) There is no embedded 1. or a. in the text of the list.
+ * That is if 1. shows up in a alpha list it means there is a new list starting or if b.
+ * shows up after a. then it the next item
+ */
+ //@formatter:on
+ // find first text char
+ char[] theChars = stringBuilderToChars(returnString);
+ int[] startEnd = findEndOfList(theChars, 0);
+ int iPos = startEnd[0];
+ int endOfList = startEnd[1];
+ int startOfNextList = startEnd[2];
+ boolean isNumeric = Character.isDigit(theChars[iPos]);
+ boolean isLowerCase = Character.isLowerCase(theChars[iPos]);
+ int currentNumber = 0;
+ String currentLetter = "";
+ if (isNumeric) {
+ int startPos = iPos;
+ while ((theChars[iPos] != '.') && theChars[iPos] != ')') {
+ iPos++;
+ }
+ String theNumber = returnString.substring(startPos, iPos);
+ currentNumber = Integer.parseInt(theNumber);
+ } else {
+ int startPos = iPos;
+ while ((theChars[iPos] != '.') && theChars[iPos] != ')') {
+ iPos++;
+ }
+ currentLetter = returnString.substring(startPos, iPos);
+ }
+ int nextItem = 0;
+ returnString.delete(iPos - 1, iPos + 1);
+ endOfList -= 2;
+ startOfNextList -= 2;
+ String insertValue = null;
+ if (isNumeric) {
+ insertValue = "<ol>";
+ } else if (isLowerCase) {
+ insertValue = "<ol type = \"a\">";
+ } else {
+ insertValue = "<ol type = \"A\">";
+ }
+ returnString.insert(iPos - 1, insertValue);
+ if (iPos < endOfList) {
+ endOfList = endOfList + insertValue.length();
+ startOfNextList = startOfNextList + insertValue.length();
+ }
+ iPos += insertValue.length();
- if (inArtifact) {
- ListIterator<String> iter = theArtifact.listIterator(rowIndex);
- String theColumnValue = iter.next();
- theColumnValue += "\n" + rowValue.trim();
- iter.set(theColumnValue);
+ listData theListData = new listData();
+ boolean lastWasSublist = false;
+ while (nextItem != -1) {
+ if (theListData.getNewList()) {
+ lastWasSublist = true;
+ } else {
+ lastWasSublist = false;
+ returnString.insert(iPos - 1, LIST_ITEM_TAG);
+ if (iPos < endOfList) {
+ endOfList = endOfList + LIST_ITEM_TAG.length();
+ startOfNextList = startOfNextList + LIST_ITEM_TAG.length();
+ }
+ iPos += LIST_ITEM_TAG.length() - 1;
+ }
+ theChars = stringBuilderToChars(returnString);
+ nextItem = findNextListItem(theChars, iPos, isNumeric, isLowerCase, currentNumber, currentLetter, theListData);
+ if (nextItem == -1) {
+ break;
+ }
+
+ if (theListData.getNewList()) {
+ int startPoint = (nextItem < startOfNextList) ? nextItem : startOfNextList;
+ String theSublist = returnString.substring(0, startPoint);
+ int end = theListData.getNextItem();
+ if (end >= returnString.length()) {
+ end = returnString.length() - 1;
+ }
+
+ String theRawSublist = new String(theChars, startPoint, end - startPoint + 1);
+ int initialLen = theRawSublist.length();
+ theRawSublist = processList(theRawSublist);
+ theSublist += theRawSublist;
+ theSublist += LIST_ITEM_END_TAG;
+ int delta = (theRawSublist.length() - initialLen) + LIST_ITEM_END_TAG.length();
+ endOfList += delta;
+ startOfNextList += delta;
+ if ((theListData.getNextItem() != -1) && (theListData.getNextItem() < returnString.length())) {
+ theSublist += returnString.substring(theListData.getNextItem() + 1);
+ }
+ returnString.delete(0, returnString.length());
+ returnString.append(theSublist);
+
+ } else {
+ if (isNumeric) {
+ currentNumber =
+ Integer.valueOf(returnString.substring(nextItem, nextItem + theListData.getItemLength() - 1));
} else {
- theArtifact.add(rowValue.trim());
+ currentLetter = returnString.substring(nextItem, nextItem + theListData.getItemLength() - 1);
+ }
+ returnString.delete(nextItem, nextItem + theListData.getItemLength());
+ endOfList -= theListData.getItemLength();
+ startOfNextList -= theListData.getItemLength();
+ /*************************************************************
+ * Since we are converting a line of text, there is a blank line after it. Delete the <BR>
+ * </BR>
+ */
+ if (!lastWasSublist) {
+ int end = nextItem;
+ if (end > returnString.length()) {
+ end = returnString.length();
+ }
+ String test = returnString.substring(0, end);
+ int lastPoint = test.lastIndexOf(BLANK_HTML_LINE);
+ if (lastPoint != -1) {
+ returnString.delete(lastPoint, end);
+ int delta = test.length() - lastPoint;
+ endOfList -= delta;
+ nextItem -= delta;
+ startOfNextList -= delta;
+ }
}
+ if (!lastWasSublist) {
+ returnString.insert(nextItem, LIST_ITEM_END_TAG);
+ if (nextItem < endOfList) {
+ endOfList = endOfList + LIST_ITEM_END_TAG.length();
+ startOfNextList = startOfNextList + LIST_ITEM_END_TAG.length();
+ }
+ nextItem = nextItem + LIST_ITEM_END_TAG.length();
+ }
+ iPos = nextItem + 1;
+ }
+ theChars = stringBuilderToChars(returnString);
+ }
+ // find the insertion point for list end
+ String tokenToInsert = "</li></ol>";
+ if (theListData.getNewList()) {
+ tokenToInsert = "</ol>";
+ }
+
+ if (endOfList < theChars.length) {
+ returnString.insert(endOfList, tokenToInsert);
+ } else {
+ // verify the list doesn't end with <BR></BR>
+ String test = returnString.toString();
+ int lastPoint = test.lastIndexOf(BLANK_HTML_LINE);
+ if (lastPoint == (test.length() - BLANK_HTML_LINE.length())) {
+ returnString.delete(lastPoint, returnString.length());
+ }
+ returnString.append(tokenToInsert);
+ }
+
+ return returnString.toString();
+ }
+ /**
+  * Locates the boundaries of the list text inside {@code theChars}, scanning forward from {@code startPoint}.
+  * Tags are tracked with a simple counter: any {@code <x ...>} increments it and any {@code </x>} decrements it.
+  * NOTE(review): self-closing tags such as {@code <br />} are counted as opening tags by this logic.
+  *
+  * @param theChars the HTML fragment to scan
+  * @param startPoint index at which scanning starts
+  * @return a three element array: [0] index of the first non-whitespace character outside a tag that is seen
+  * after the first loop pass (0 if none), [1] the index the caller uses as the end of the list, [2] the index the
+  * caller uses as the start of what follows the list. [1] and [2] default to {@code theChars.length}.
+  */
private int[] findEndOfList(char[] theChars, int startPoint) {
+    int iPos = startPoint;
+    int[] iReturn = {0, theChars.length, theChars.length};
+    int tagCount = 0;
+    boolean notFirst = false;
+    boolean foundNonTagItem = false;
+    while (iPos < theChars.length) {
+       // Character.isWhitespace already covers tab and newline
+       while ((iPos < theChars.length) && Character.isWhitespace(theChars[iPos])) {
+          iPos++;
+       }
+       if (iPos >= theChars.length) {
+          iReturn[1] = theChars.length;
+          break;
+       }
+       if (theChars[iPos] == '<') {
+          int startofCloseTag = iPos;
+          iPos++;
+          // bounds guard: a '<' that is the last character must not be read past the end of the array
+          if ((iPos < theChars.length) && (theChars[iPos] == '/')) {
+             tagCount--;
+             while ((iPos < theChars.length) && (theChars[iPos] != '>')) {
+                iPos++;
+             }
+             // the closing tag that balances the tags around the text (or an unmatched one) ends the list
+             if (((tagCount == 0) && foundNonTagItem) || (tagCount < 0)) {
+                iReturn[1] = startofCloseTag;
+                iReturn[2] = iPos;
+                while ((iReturn[2] < theChars.length) && (theChars[iReturn[2]] != '<')) {
+                   iReturn[2] = iReturn[2] + 1;
+                }
+                break;
+             }
+          } else {
+             tagCount++;
+          }
+          while ((iPos < theChars.length) && (theChars[iPos] != '>')) {
+             iPos++;
+          }
+          iPos++;
+       } else if (notFirst) {
+          if (!foundNonTagItem) {
+             iReturn[0] = iPos;
+             foundNonTagItem = true;
+          }
+          if (tagCount == 0) {
+             break;
+          } else {
+             // find next tag
+             while ((iPos < theChars.length) && (theChars[iPos] != '<')) {
+                iPos++;
+             }
+             iReturn[1] = iPos - 1;
+             // find the end of the tag
+             iReturn[2] = iPos;
+             while ((iReturn[2] < theChars.length) && (theChars[iReturn[2]] != '>')) {
+                iReturn[2] = iReturn[2] + 1;
+             }
+             iReturn[2] = iReturn[2] + 1;
+          }
+       } else {
+          // no opening tags, therefore list not enclosed in tags.
+          iPos = theChars.length;
         }
+       notFirst = true;
+    }
+    return iReturn;
+ }
+ /**
+  * Returns a new char array holding a snapshot of the characters currently in {@code sb}.
+  *
+  * @param sb the builder to copy; must not be null
+  * @return a freshly allocated array of length {@code sb.length()}
+  */
static char[] stringBuilderToChars(StringBuilder sb) {
+    // toString() snapshots the builder and toCharArray() allocates the independent copy
+    return sb.toString().toCharArray();
+ }
+
+ private class listData {
+ private boolean newList;
+ private int itemLength;
+ private int nextItem;
+
+ public listData() {
+ this.newList = false;
+ this.itemLength = 0;
}
- inArtifact = true;
+
+ public int getItemLength() {
+ return itemLength;
+ }
+
+ public int getNextItem() {
+ return nextItem;
+ }
+
+ public boolean getNewList() {
+ return newList;
+ }
+
+ public void setNextItem(int nextItem) {
+ this.nextItem = nextItem;
+ }
+
+ public void setItemLength(int itemLength) {
+ this.itemLength = itemLength;
+ }
+
+ public void setNewList(boolean newList) {
+ this.newList = newList;
+ }
+ }
+
+ /**
+  * Searches {@code theChars}, starting one position past {@code iPos}, for the next list marker. A marker is
+  * either the next item of the current list (the next number or letter followed by a period) or the start of a
+  * new list ("a.", "a)", "1." or "1)"; the letter form is matched case-insensitively).
+  *
+  * @param theChars the text being converted
+  * @param iPos position after which the search begins
+  * @param isNumeric true when the current list is numbered, false when it is lettered
+  * @param isLowerCase for lettered lists, whether the letters are lower case
+  * @param currentNumber number of the current item (used when {@code isNumeric})
+  * @param currentLetter letter(s) of the current item (used when not {@code isNumeric})
+  * @param listData receives the details of the hit: whether it starts a new list, the marker length and the
+  * position recorded for the next item of the current list
+  * @return index into {@code theChars} of the earliest marker found, or -1 when there is none
+  */
+ private int findNextListItem(char[] theChars, int iPos, boolean isNumeric, boolean isLowerCase, int currentNumber, String currentLetter, listData listData) {
+    //@formatter:off
+    /****************************************************************************
+     * Now the tricky part. We are looking for
+     * 1) <space><next value>.<space or &nbsp; or &#something>
+     * 2) <space><next level value>.
+     */
+    //@formatter:on
+
+    iPos++;
+    if (iPos >= theChars.length) {
+       return -1;
+    }
+    StringBuilder asString = new StringBuilder();
+    asString.append(theChars, iPos, theChars.length - iPos);
+
+    // start of a lettered list: "a." or "a)" in either case (previously "a." was searched twice)
+    String lowerCased = asString.toString().toLowerCase();
+    int aList = earliestMarker(lowerCased.indexOf("a."), lowerCased.indexOf("a)"));
+    // start of a numbered list: "1." or "1)" (previously the lettered result decided which index was used)
+    int oneList = earliestMarker(asString.indexOf("1."), asString.indexOf("1)"));
+
+    int nextListItem = -1;
+    String nextItem = "";
+    if (isNumeric) {
+       nextItem = Integer.toString(currentNumber + 1) + ".";
+    } else {
+       // assume Ascii -- that is, that the letters are contiguous
+       byte[] theLetters = null;
+       try {
+          theLetters = currentLetter.getBytes("UTF-8");
+       } catch (UnsupportedEncodingException e) {
+          theLetters = currentLetter.getBytes();
+       }
+       int theCharToChange = theLetters.length - 1;
+       if (currentLetter.toLowerCase().charAt(theCharToChange) == 'z') {
+          if (theCharToChange > 0) {
+             theLetters[theCharToChange - 1]++;
+             if (isLowerCase) {
+                theLetters[theCharToChange] = "a".getBytes()[0];
+             } else {
+                theLetters[theCharToChange] = "A".getBytes()[0];
+             }
+          } else {
+             byte[] newLetterArray = new byte[theLetters.length + 1];
+             for (int i = 0; i < newLetterArray.length; i++) {
+                if (isLowerCase) {
+                   newLetterArray[i] = "a".getBytes()[0];
+                } else {
+                   newLetterArray[i] = "A".getBytes()[0];
+                }
+             }
+             theLetters = newLetterArray;
+          }
+       } else {
+          // NOTE(review): for multi-letter markers this bumps the FIRST letter ("aa" -> "ba");
+          // confirm the intended sequence after "z" before changing it
+          theLetters[0]++;
+       }
+       nextItem = new String(theLetters) + ".";
+    }
+    nextListItem = asString.indexOf(nextItem);
+    // verify this is not just a char and period: the marker must follow whitespace, ';' or '>'.
+    // The preceding character is read from theChars so a marker at offset 0 no longer triggers charAt(-1).
+    while ((nextListItem != -1) && !followsItemBoundary(theChars, iPos + nextListItem)) {
+       nextListItem = asString.indexOf(nextItem, nextListItem + 1);
+    }
+    if ((aList == -1) && (oneList == -1) && (nextListItem == -1)) {
+       return -1;
+    }
+    // convert to absolute positions; "not found" becomes a value past the end so it never wins the minimum
+    aList = (aList != -1) ? aList + iPos : theChars.length + 1;
+    oneList = (oneList != -1) ? oneList + iPos : theChars.length + 1;
+    nextListItem = (nextListItem != -1) ? nextListItem + iPos : theChars.length + 1;
+    int iReturn = (aList < oneList) ? aList : oneList;
+    iReturn = (iReturn < nextListItem) ? iReturn : nextListItem;
+    if (iReturn == nextListItem) {
+       listData.setNewList(false);
+       listData.setItemLength(nextItem.length());
+       listData.setNextItem(nextListItem);
+    } else {
+       listData.setNewList(true);
+       listData.setItemLength(2);
+       listData.setNextItem(nextListItem - 1);
+    }
+    return iReturn;
+ }
+
+ /** Returns the smaller of two indexOf results, treating -1 as "not found"; -1 only when both are -1. */
+ private static int earliestMarker(int first, int second) {
+    if (first == -1) {
+       return second;
+    }
+    if (second == -1) {
+       return first;
+    }
+    return (first < second) ? first : second;
+ }
+
+ /** True when the character before {@code markerPos} is whitespace, ';' or '>', or when there is none. */
+ private static boolean followsItemBoundary(char[] theChars, int markerPos) {
+    if (markerPos <= 0) {
+       return true;
+    }
+    char prev = theChars[markerPos - 1];
+    return Character.isWhitespace(prev) || (prev == ';') || (prev == '>');
+ }
private void processArtifact() throws OseeCoreException {
@@ -585,7 +999,7 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
case REQUIREMENTS:
StringBuffer imageFileList = new StringBuffer("");
getImageList(rowValue, imageFileList);
- rowValue = normailizeHtml(rowValue);
+ rowValue = normalizeHtml(rowValue);
String imageFile = imageFileList.toString();
if (!imageFile.isEmpty()) {
String theImage;
@@ -783,9 +1197,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
return returnValue;
}
- private String normailizeHtml(String inputHtml) {
+ private String normalizeHtml(String inputHtml) {
- String returnValue = NormalizeHtml.convertToNormalizedHTML(inputHtml);
+ String returnValue = NormalizeHtml.convertToNormalizedHTML(inputHtml, true, true, true);
int bodyStart = returnValue.indexOf(BODY_START_TAG);
int bodyEnd = returnValue.indexOf(BODY_END_TAG);
if (bodyStart != -1) {
@@ -821,10 +1235,10 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
* tags these are not meaningful
*/
returnValue = returnValue.trim();
- int brTag = returnValue.lastIndexOf(BR_TAG);
- while (brTag == returnValue.length() - BR_TAG.length()) {
+ int brTag = returnValue.toLowerCase().lastIndexOf(BR_TAG);
+ while ((brTag != -1) && (brTag == returnValue.length() - BR_TAG.length())) {
returnValue = returnValue.substring(0, brTag).trim();
- brTag = returnValue.lastIndexOf(BR_TAG);
+ brTag = returnValue.toLowerCase().lastIndexOf(BR_TAG);
}
return returnValue;
}
diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java
index 2724c31da23..04e3af90248 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java
@@ -10,6 +10,7 @@
*******************************************************************************/
package org.eclipse.osee.framework.skynet.core.utility;
+import java.util.ArrayList;
import java.util.TreeMap;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Attribute;
@@ -19,7 +20,6 @@ import org.jsoup.nodes.Document.OutputSettings;
import org.jsoup.nodes.Document.QuirksMode;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities.EscapeMode;
-import org.jsoup.nodes.TextNode;
import org.jsoup.parser.Tag;
import org.jsoup.select.Elements;
@@ -63,11 +63,15 @@ public final class NormalizeHtml {
private static final String rdquo = String.valueOf('\u201D');
private static final String lsquo = String.valueOf('\u2018');
private static final String rsquo = String.valueOf('\u2019');
+ private static final String figureDash = String.valueOf('\u2012');
+ private static final String enDash = String.valueOf('\u2013');
+ private static final String emDash = String.valueOf('\u2014');
private static final String NON_BREAK_SPACE = String.valueOf('\u00A0');
private static final String NON_BREAK_FIGURE_SPACE = String.valueOf('\u2007');
private static final String NON_BREAK_NARROW_SPACE = String.valueOf('\u202F');
private static final String NON_BREAK_WORD_JOINER = String.valueOf('\u2060');
private static final String NON_BREAK_ZERO_WIDTH = String.valueOf('\uFEFF');
+ private static ArrayList<String> allowedAttributes = null;
private NormalizeHtml() {
// Utility Class
@@ -93,7 +97,7 @@ public final class NormalizeHtml {
* @return Normalized HTML
*/
public static String convertToNormalizedHTML(String inputHTML) {
- return convertToNormalizedHTML(inputHTML, false, false);
+ return convertToNormalizedHTML(inputHTML, false, false, false);
}
/**
@@ -103,10 +107,10 @@ public final class NormalizeHtml {
*
* @param inputHTML HTML source to be normalized
* @param removeInitialStyle Remove initial style information.
- * @param removeEmptyStyle Remove any empty (containing no text) style sections
+ * @param removeEmptyTags Remove any empty (containing no text) style sections
* @return Normalized HTML
*/
- public static String convertToNormalizedHTML(String inputHTML, boolean removeInitialStyle, boolean removeEmptyStyle) {
+ public static String convertToNormalizedHTML(String inputHTML, boolean removeInitialStyle, boolean removeEmptyTags, boolean removeHeaderFooter) {
Document doc = Jsoup.parse(inputHTML);
doc.quirksMode(QuirksMode.noQuirks);
OutputSettings outputSettings = doc.outputSettings();
@@ -134,77 +138,154 @@ public final class NormalizeHtml {
e.tagName("span");
e.attr("style", "text-decoration: line-through;");
}
+ removeDepreactedTags(doc);
+ processTagsWithAttributes(doc);
+ processHeaderFooter(doc, removeHeaderFooter);
processFontTags(doc);
processInitialStyleTags(doc, removeInitialStyle);
- processEmptyStyleTags(doc, removeEmptyStyle);
+ processEmptyTags(doc, removeEmptyTags);
return processText(doc);
}
+ static void removeDepreactedTags(Document doc) {
+ Elements center = doc.select("center");
+ for (Element e : center) {
+ Elements children = e.children();
+ for (Element c : children) {
+ e.before(c);
+ }
+ e.remove();
+ }
+ }
+
+ private static void processTagsWithAttributes(Document doc) {
+ /****************************************************************************
+ * HTML allows the same table to be represented many ways. Normalize the information into a standard format. Note
+ * that this will simplify the table as well (that is, some formatting may be lost). Remember, the goal is to reduce
+ * the HTML to the point that it is the same regardless of the source editor. Images have similar issues, so they
+ * are reduced to the same small set of supported attributes.
+ */
+
+ if (allowedAttributes == null) {
+ allowedAttributes = new ArrayList<String>();
+ allowedAttributes.add("border");
+ allowedAttributes.add("frame");
+ allowedAttributes.add("rules");
+ allowedAttributes.add("valign");
+ allowedAttributes.add("src");
+ }
+ Elements tables = doc.select("table");
+ for (Element table : tables) {
+ removeUnsupportedAttributes(table, true);
+ // remove Colgroup
+ Elements colgroup = table.select("colgroup");
+ for (Element c : colgroup) {
+ c.remove();
+ }
+ // no support for header / footer -- just rows
+
+ removeElements(table, "thead");
+ removeElements(table, "tfoot");
+ removeElements(table, "tbody");
+ // remove unsupported attributes on tr and td tags and move the attributes from td to tr
+ Elements rows = table.select("td");
+ for (Element row : rows) {
+ String[] attributeValues = removeUnsupportedAttributes(row, false);
+ Element tr = null;
+ Element parent = row.parent();
+ if (parent.tagName().equals("tr")) {
+ tr = parent;
+ } else {
+ Elements siblings = row.siblingElements();
+ for (Element e : siblings) {
+ if (e.tagName().equals("tr")) {
+ tr = e;
+ break;
+ }
+ }
+ }
+ if (tr != null) {
+ for (int i = 0; i < attributeValues.length; i++) {
+ if (attributeValues[i] != null) {
+ tr.attr(allowedAttributes.get(i), attributeValues[i].toLowerCase());
+ }
+ }
+ }
+ }
+ rows = table.select("tr");
+ for (Element row : rows) {
+ removeUnsupportedAttributes(row, true);
+ }
+ }
+
+ Elements images = doc.select("img");
+ for (Element image : images) {
+ removeUnsupportedAttributes(image, true);
+ }
+ }
+
static void processInitialStyleTags(Document doc, boolean removeInitialStyle) {
if (removeInitialStyle) {
- boolean foundText = false;
Elements pTags = doc.select("p");
for (Element p : pTags) {
- Elements style = p.getElementsByAttribute("style");
- for (Element e : style) {
- Element parent = e.parent();
- if (!parent.tagName().equals("span")) {
- if (e.hasText()) {
- String text = e.text();
- TextNode newNode = new TextNode(text, e.baseUri());
- e.remove();
- // Insert newline between various text elements
- if (foundText) {
- Tag tag = Tag.valueOf("br");
- Element br = new Element(tag, parent.baseUri());
- parent.appendChild(br);
- br = new Element(tag, parent.baseUri());
- parent.appendChild(br);
- }
- parent.appendChild(newNode);
- foundText = true;
- break;
+ if (!p.attr("style").equals("")) {
+ if (p.hasText()) {
+ if (!p.parent().tagName().equals("li")) {
+ Element cr = new Element(Tag.valueOf("br"), p.baseUri());
+ p.after(cr);
}
+ p.unwrap();
}
}
}
Elements span = doc.select("span");
for (Element s : span) {
- Elements style = s.getElementsByAttributeValueMatching("style", "font*|margin*");
- for (Element e : style) {
- Element parent = e.parent();
- if (!parent.tagName().equals("p")) {
- if (e.hasText()) {
- String text = e.text();
- TextNode newNode = new TextNode(text, e.baseUri());
- e.remove();
- parent.appendChild(newNode);
- // Insert newline between various text elements
- if (foundText) {
- Tag tag = Tag.valueOf("br");
- Element br = new Element(tag, parent.baseUri());
- parent.appendChild(br);
- parent.appendChild(br);
- }
- foundText = true;
- break;
- }
- }
+ if (!s.attr("style").equals("") && !s.hasText()) {
+ s.remove();
}
}
}
}
- private static void processEmptyStyleTags(Document doc, boolean removeEmptyStyle) {
- if (removeEmptyStyle) {
+ private static void processEmptyTags(Document doc, boolean removeEmptyTags) {
+ if (removeEmptyTags) {
Elements pTags = doc.select("p");
for (Element p : pTags) {
+ // Element cr = new Element(Tag.valueOf("br"), p.baseUri());
+ //p.after(cr);
deleteEmptyElemens(p);
}
Elements span = doc.select("span");
for (Element s : span) {
deleteEmptyElemens(s);
}
+ Elements div = doc.select("div");
+ for (Element e : div) {
+ if (!e.hasText()) {
+ e.remove();
+ } else {
+ e.unwrap();
+ }
+ }
+ Elements aTags = doc.select("a");
+ for (Element a : aTags) {
+ Attributes attr = a.attributes();
+ if ((attr.size() == 1) && (!attr.get("name").equals(""))) {
+ a.unwrap();
+ }
+ }
+ }
+ }
+
+ static void processHeaderFooter(Document doc, boolean removeHeaderFooter) {
+ if (removeHeaderFooter) {
+ Elements div = doc.select("div");
+ for (Element d : div) {
+ Elements headerFooter = d.getElementsByAttributeValueMatching("type", "HEADER*|FOOTER*");
+ for (Element hf : headerFooter) {
+ hf.remove();
+ }
+ }
}
}
@@ -215,18 +296,12 @@ public final class NormalizeHtml {
Elements style = elementToCheck.getElementsByAttributeValueMatching("style", "font*|margin*");
for (Element e : style) {
if (e.hasText()) {
- break;
+ continue;
} else {
- Elements images = e.select("img");
- Element parent = e.parent();
- e.remove();
- if (images.size() > 0) {
- for (Element image : images) {
- parent.appendChild(image);
- }
- }
+ e.unwrap();
}
}
+
}
private static String processText(Document doc) {
@@ -243,6 +318,14 @@ public final class NormalizeHtml {
theText = theText.replaceAll(rsquo, "'");
theText = theText.replaceAll("&apos;", "'");
+ /************************************************************************
+ * Convert &ndash; and the Unicode figure, en and em dashes to a plain hyphen (-). Not all editors handle these correctly.
+ */
+ theText = theText.replaceAll("&ndash;", "-");
+ theText = theText.replaceAll(figureDash, "-");
+ theText = theText.replaceAll(enDash, "-");
+ theText = theText.replaceAll(emDash, "-");
+
//@formatter:off
/*****************************************************************************
* Convert the non-breaking characters to the HTML value (&nbsp;)
@@ -324,4 +407,39 @@ public final class NormalizeHtml {
return theReturn;
}
+ static void removeElements(Element table, String theElementParent) {
+ Elements parents = table.select(theElementParent);
+ for (Element p : parents) {
+ Elements children = p.children();
+ for (Element c : children) {
+ p.before(c);
+ }
+ p.remove();
+ }
+
+ }
+
+ static String[] removeUnsupportedAttributes(Element e, boolean addBack) {
+ String[] attributeValues = {null, null, null, null, null};
+ // remove "unsupported" attributes
+ Attributes attr = e.attributes();
+ for (Attribute a : attr) {
+ if (allowedAttributes.contains(a.getKey())) {
+ if (!(a.getKey().equals("border") && a.getValue().equals("0"))) {
+ attributeValues[allowedAttributes.indexOf(a.getKey())] = a.getValue();
+ }
+ }
+ e.removeAttr(a.getKey());
+ }
+ if (addBack) {
+ // set specific order for attributes
+ for (int i = 0; i < attributeValues.length; i++) {
+ if (attributeValues[i] != null) {
+ e.attr(allowedAttributes.get(i), attributeValues[i]);
+ }
+ }
+ }
+ return attributeValues;
+ }
+
}

Back to the top