feature[ats_GVNXN]: Update WORDML and DOORS to HTML conversions

Change-Id: I3467c58e808298b3706fe024618d5cf4796944ed Signed-off-by: mpotterc0k <marc.a.potter@boeing.com>
author: mpotterc0k 2013-08-19 15:36:25 +0000
committer: Gerrit Code Review @ Eclipse.org 2013-08-29 16:13:13 +0000
commit: b7c17cd7038b4adfc1e9b14f568540baed73b556 (patch)
tree: 880537cc77a652f721b2db17168dae276dce5e43
parent: 419c045952758bcd1c7a2aba685d890ea20db86a (diff)
download: org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.gz
org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.xz
org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.zip
6 files changed, 674 insertions, 155 deletions
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java
index 3d31f0aa098..1e100be7cd2 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtmlTest.java
@@ -15,8 +15,8 @@ import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
-import org.junit.Assert;
 import org.eclipse.osee.framework.jdk.core.util.Lib;
+import org.junit.Assert;
 import org.junit.Test;
 
 /**
@@ -34,7 +34,6 @@ public class NormalizeHtmlTest {
       String expected = getResource(CONVERTED_HTML);
 
       input = NormalizeHtml.convertToNormalizedHTML(input);
-
       input = bodyOnly(input);
       expected = bodyOnly(expected);
       input = input.replaceAll("\r", "");
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm
index 092f5e515e8..075877b41d5 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm
+++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_converted.htm
@@ -187,45 +187,39 @@
                 </span>
             </span> 
         </p> 
-        <table width="4369" border="1" bordercolor="#000000" cellpadding="0" cellspacing="3"> 
-            <colgroup>
-                <col width="2180" /> 
-                <col width="2178" /> 
-            </colgroup>
-            <tbody>
-                <tr valign="TOP"> 
-                    <td width="2180"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: xx-large;">26 pt in table with border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                    <td width="2178"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: xx-large;">26 pt in table with border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                </tr> 
-                <tr valign="TOP"> 
-                    <td width="2180"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: xx-large;">28 pt in table with border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                    <td width="2178"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: xx-large;">28 pt in table with border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                </tr> 
-            </tbody>
+        <table border="1"> 
+            <tr valign="top"> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: xx-large;">26 pt in table with border</span>
+                        </span>
+                    </p> 
+                </td> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: xx-large;">26 pt in table with border</span>
+                        </span>
+                    </p> 
+                </td> 
+            </tr>
+            <tr valign="top"> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: xx-large;">28 pt in table with border</span>
+                        </span>
+                    </p> 
+                </td> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: xx-large;">28 pt in table with border</span>
+                        </span>
+                    </p> 
+                </td> 
+            </tr>
         </table> 
         <p style="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%">
             <br /> 
@@ -239,45 +233,39 @@
                 </span>
             </span>
         </p> 
-        <table width="4369" border="1" bordercolor="#000000" cellpadding="0" cellspacing="0"> 
-            <colgroup>
-                <col width="2183" /> 
-                <col width="2184" /> 
-            </colgroup>
-            <tbody>
-                <tr valign="TOP"> 
-                    <td width="2183"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: 300%;">36 pt in table without border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                    <td width="2184"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: 300%;">36 pt in table without border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                </tr> 
-                <tr valign="TOP"> 
-                    <td width="2183"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: 300%;">40 pt in table w/o border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                    <td width="2184"> 
-                        <p style="margin-top: 0.07in">
-                            <span style=" font-family: Times New Roman, serif">
-                                <span style=" font-size: 300%;">40 pt in table w/o border</span>
-                            </span>
-                        </p> 
-                    </td> 
-                </tr> 
-            </tbody>
+        <table> 
+            <tr valign="top"> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: 300%;">36 pt in table without border</span>
+                        </span>
+                    </p> 
+                </td> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: 300%;">36 pt in table without border</span>
+                        </span>
+                    </p> 
+                </td> 
+            </tr>
+            <tr valign="top"> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: 300%;">40 pt in table w/o border</span>
+                        </span>
+                    </p> 
+                </td> 
+                <td> 
+                    <p style="margin-top: 0.07in">
+                        <span style=" font-family: Times New Roman, serif">
+                            <span style=" font-size: 300%;">40 pt in table w/o border</span>
+                        </span>
+                    </p> 
+                </td> 
+            </tr>
         </table> 
         <p style="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%">
             <br /> 
diff --git a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm
index ef338dbdb40..39b724f614f 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm
+++ b/plugins/org.eclipse.osee.framework.skynet.core.test/src/org/eclipse/osee/framework/skynet/core/utility/support/NormalizeHtml_test_doc.htm
@@ -90,7 +90,7 @@ pt font normal </SPAN></FONT></FONT></FONT>
 </P>
 <P STYLE="margin-top: 0.07in; margin-bottom: 0in; line-height: 100%"><FONT COLOR="#000000"><FONT FACE="Times New Roman, serif"><FONT SIZE=7 STYLE="font-size: 32pt"><SPAN STYLE="background: #ffffff">32
 pt font</SPAN></FONT></FONT></FONT></P>
-<TABLE WIDTH=4369 BORDER=1 BORDERCOLOR="#000000" CELLPADDING=0 CELLSPACING=0>
+<TABLE WIDTH=4369 BORDER=0 BORDERCOLOR="#000000" CELLPADDING=0 CELLSPACING=0>
 	<COL WIDTH=2183>
 	<COL WIDTH=2184>
 	<TR VALIGN=TOP>
diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java
index d2f3f7c9d12..992658e8888 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/artifact/Artifact.java
@@ -963,8 +963,8 @@ public class Artifact extends NamedIdentity<String> implements IArtifact, IAdapt
     * enumerated and value is already present
     */
    private final <T> void setOrAddAttribute(IAttributeType attributeType, T value) throws OseeCoreException {
-      List<Attribute<String>> attributes = getAttributes(attributeType);
-      for (Attribute<String> canidateAttribute : attributes) {
+      List<Attribute<Object>> attributes = getAttributes(attributeType);
+      for (Attribute<?> canidateAttribute : attributes) {
          if (canidateAttribute.getValue().equals(value)) {
             return;
          }
diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java
index 6396979a1af..30cbfcb8f16 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/importing/parsers/DoorsArtifactExtractor.java
@@ -12,6 +12,7 @@ package org.eclipse.osee.framework.skynet.core.importing.parsers;
 
 import java.io.File;
 import java.io.FileFilter;
+import java.io.UnsupportedEncodingException;
 import java.net.URI;
 import java.net.URISyntaxException;
 import java.util.HashMap;
@@ -54,7 +55,10 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
    private final static String BR_TAG = "<br />";
    private final static String BODY_START_TAG = "<body>";
    private final static String BODY_END_TAG = "</body>";
+   private final static String LIST_ITEM_TAG = "<li>";
+   private final static String LIST_ITEM_END_TAG = "</li>";
    private final static String IMAGE_BASE_NAME = "Image Content_";
+   private final static String BLANK_HTML_LINE = "<br />"; // same markup as BR_TAG; static like the sibling constants
    private final static String[] VERIFICATION_KEYWORDS = {
       "Effectivity:",
       "Verf Method:",
@@ -223,7 +227,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
                tableFound = true;
             }
          } else if (qName.equalsIgnoreCase("tr")) {
-            // Do nothing here -- no processing needed
+            if (embededTableCount > 0) {
+               cell.append("<tr>");
+            }
          } else if (qName.equalsIgnoreCase("th")) {
             if (embededTableCount > 0) {
                // table within the table
@@ -274,6 +280,8 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
                      throw new SAXException(ex);
                   }
                }
+            } else {
+               cell.append("</tr>");
             }
          } else if (qName.equalsIgnoreCase("th")) {
             if (embededTableCount > 0) {
@@ -415,7 +423,7 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
       /***************************************************************
        * First check the document applicability box, if it is empty this is a header row
        */
-      boolean isHeaderRow = false, foundDataType = false;
+      boolean isHeaderRow = false, foundDataType = false, isList;
       int rowIndex;
       for (rowIndex = 0; rowIndex < row.length; rowIndex++) {
          RowTypeEnum rowType = rowIndexToRowTypeMap.get(rowIndex);
@@ -438,7 +446,12 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
          }
       }
       if (!rowIndexToRowTypeMap.isEmpty()) {
+         isList = false;
+         int requirementIndex = -1;
+         String requirementColumn = "";
+         boolean isRequirementColumn = false;
          for (rowIndex = 0; rowIndex < row.length; rowIndex++) {
+            isRequirementColumn = false;
             RowTypeEnum rowType = rowIndexToRowTypeMap.get(rowIndex);
 
             String rowValue = row[rowIndex];
@@ -465,6 +478,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
                      }
                      rowValue = "";
                   }
+                  requirementIndex = rowIndex;
+                  isRequirementColumn = true;
+                  requirementColumn = rowValue;
 
                   break;
 
@@ -513,7 +529,6 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
 
                      case TABLE:
                      case INFORMATION:
-                     case LIST:
                      case FIGURE:
                         isRequirement = lastDataType.equals(DataTypeEnum.REQUIREMENT);
                         break;
@@ -524,6 +539,11 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
                         theArtifact.clear();
                         return;
 
+                     case LIST:
+                        isRequirement = lastDataType.equals(DataTypeEnum.REQUIREMENT);
+                        isList = true;
+                        break;
+
                      case OTHER:
                         foundDataType = false;
                         lastDataType = DataTypeEnum.OTHER;
@@ -543,20 +563,414 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
                   break;
 
             }
+            if (!isRequirementColumn) {
+               if (inArtifact) {
+                  ListIterator<String> iter = theArtifact.listIterator(rowIndex);
+                  String theColumnValue = iter.next();
+                  theColumnValue += " " + rowValue.trim();
+                  iter.set(theColumnValue);
+               } else {
+                  theArtifact.add(rowValue.trim());
+               }
+            } else {
+               if (!inArtifact) {
+                  theArtifact.add("");
+               }
+            }
+
+         }
+         // process requirement column -- functionally always inArtifact because of the empty add above
+         ListIterator<String> iter = theArtifact.listIterator(requirementIndex);
+         String theColumnValue = iter.next();
+         if (isList) {
+            requirementColumn = processList(requirementColumn);
+         }
+         theColumnValue += " " + requirementColumn.trim();
+         iter.set(theColumnValue);
+      }
+      inArtifact = true;
+   }
+
+   private String processList(String inputValue) {
+      inputValue = normalizeHtml(inputValue);
+      /**************************************************************************************
+       * The way the Doors export works with lists leaves badly spaced <div> tags -- remove them
+       */
+      inputValue = inputValue.replaceAll("<div>", "");
+      inputValue = inputValue.replaceAll("</div>", "");
+      /*********************************************************************************
+       * Collapse two blank HTML lines separated only by whitespace into a single one
+       */
+      inputValue = inputValue.replaceAll(BLANK_HTML_LINE + "\\s+" + BLANK_HTML_LINE, BLANK_HTML_LINE);
+      StringBuilder returnString = new StringBuilder(inputValue.trim());
+      //@formatter:off
+      /********************************************************************************
+       * The Doors export outputs a list as pure text (e.g. a. list item). Convert this to an HTML list.
+       *
+       * Assumptions:
+       * 1) The format of the list is either a. or 1.
+       * 2) There is no embedded 1. or a. in the text of the list.
+       *    That is, if 1. shows up in an alpha list it means a new list is starting, and if b.
+       *    shows up after a. then it is the next item.
+       */
+    //@formatter:on
+      // find the first text char, the end of this list, and where the content after it starts
+      char[] theChars = stringBuilderToChars(returnString);
+      int[] startEnd = findEndOfList(theChars, 0);
+      int iPos = startEnd[0];
+      int endOfList = startEnd[1];
+      int startOfNextList = startEnd[2];
+      boolean isNumeric = Character.isDigit(theChars[iPos]);
+      boolean isLowerCase = Character.isLowerCase(theChars[iPos]);
+      int currentNumber = 0;
+      String currentLetter = "";
+      if (isNumeric) {
+         int startPos = iPos;
+         while ((theChars[iPos] != '.') && theChars[iPos] != ')') {
+            iPos++;
+         }
+         String theNumber = returnString.substring(startPos, iPos);
+         currentNumber = Integer.parseInt(theNumber);
+      } else {
+         int startPos = iPos;
+         while ((theChars[iPos] != '.') && theChars[iPos] != ')') {
+            iPos++;
+         }
+         currentLetter = returnString.substring(startPos, iPos);
+      }
+      int nextItem = 0;
+      returnString.delete(iPos - 1, iPos + 1); // NOTE(review): removes only the last label char plus the delimiter -- confirm multi-char labels
+      endOfList -= 2;
+      startOfNextList -= 2;
+      String insertValue = null;
+      if (isNumeric) {
+         insertValue = "<ol>";
+      } else if (isLowerCase) {
+         insertValue = "<ol type = \"a\">";
+      } else {
+         insertValue = "<ol type = \"A\">";
+      }
+      returnString.insert(iPos - 1, insertValue);
+      if (iPos < endOfList) {
+         endOfList = endOfList + insertValue.length();
+         startOfNextList = startOfNextList + insertValue.length();
+      }
+      iPos += insertValue.length();
 
-            if (inArtifact) {
-               ListIterator<String> iter = theArtifact.listIterator(rowIndex);
-               String theColumnValue = iter.next();
-               theColumnValue += "\n" + rowValue.trim();
-               iter.set(theColumnValue);
+      listData theListData = new listData();
+      boolean lastWasSublist = false;
+      while (nextItem != -1) {
+         if (theListData.getNewList()) {
+            lastWasSublist = true;
+         } else {
+            lastWasSublist = false;
+            returnString.insert(iPos - 1, LIST_ITEM_TAG);
+            if (iPos < endOfList) {
+               endOfList = endOfList + LIST_ITEM_TAG.length();
+               startOfNextList = startOfNextList + LIST_ITEM_TAG.length();
+            }
+            iPos += LIST_ITEM_TAG.length() - 1;
+         }
+         theChars = stringBuilderToChars(returnString);
+         nextItem = findNextListItem(theChars, iPos, isNumeric, isLowerCase, currentNumber, currentLetter, theListData);
+         if (nextItem == -1) {
+            break;
+         }
+
+         if (theListData.getNewList()) {
+            int startPoint = (nextItem < startOfNextList) ? nextItem : startOfNextList;
+            String theSublist = returnString.substring(0, startPoint);
+            int end = theListData.getNextItem();
+            if (end >= returnString.length()) {
+               end = returnString.length() - 1;
+            }
+
+            String theRawSublist = new String(theChars, startPoint, end - startPoint + 1);
+            int initialLen = theRawSublist.length();
+            theRawSublist = processList(theRawSublist); // recurse: the nested list is converted on its own
+            theSublist += theRawSublist;
+            theSublist += LIST_ITEM_END_TAG;
+            int delta = (theRawSublist.length() - initialLen) + LIST_ITEM_END_TAG.length();
+            endOfList += delta;
+            startOfNextList += delta;
+            if ((theListData.getNextItem() != -1) && (theListData.getNextItem() < returnString.length())) {
+               theSublist += returnString.substring(theListData.getNextItem() + 1);
+            }
+            returnString.delete(0, returnString.length());
+            returnString.append(theSublist);
+
+         } else {
+            if (isNumeric) {
+               currentNumber =
+                  Integer.valueOf(returnString.substring(nextItem, nextItem + theListData.getItemLength() - 1));
             } else {
-               theArtifact.add(rowValue.trim());
+               currentLetter = returnString.substring(nextItem, nextItem + theListData.getItemLength() - 1);
+            }
+            returnString.delete(nextItem, nextItem + theListData.getItemLength());
+            endOfList -= theListData.getItemLength();
+            startOfNextList -= theListData.getItemLength();
+            /*************************************************************
+             * Since we are converting a line of text, there is a blank line after it. Delete the
+             * last <br /> found before the next item, along with everything between it and that item
+             */
+            if (!lastWasSublist) {
+               int end = nextItem;
+               if (end > returnString.length()) {
+                  end = returnString.length();
+               }
+               String test = returnString.substring(0, end);
+               int lastPoint = test.lastIndexOf(BLANK_HTML_LINE);
+               if (lastPoint != -1) {
+                  returnString.delete(lastPoint, end);
+                  int delta = test.length() - lastPoint;
+                  endOfList -= delta;
+                  nextItem -= delta;
+                  startOfNextList -= delta;
+               }
             }
+            if (!lastWasSublist) {
+               returnString.insert(nextItem, LIST_ITEM_END_TAG);
+               if (nextItem < endOfList) {
+                  endOfList = endOfList + LIST_ITEM_END_TAG.length();
+                  startOfNextList = startOfNextList + LIST_ITEM_END_TAG.length();
+               }
+               nextItem = nextItem + LIST_ITEM_END_TAG.length();
+            }
+            iPos = nextItem + 1;
+         }
+         theChars = stringBuilderToChars(returnString);
+      }
+      // pick the closing markup and find the insertion point for the list end
+      String tokenToInsert = "</li></ol>";
+      if (theListData.getNewList()) {
+         tokenToInsert = "</ol>";
+      }
+
+      if (endOfList < theChars.length) {
+         returnString.insert(endOfList, tokenToInsert);
+      } else {
+         // drop a trailing <br /> so the list does not end with a blank line
+         String test = returnString.toString();
+         int lastPoint = test.lastIndexOf(BLANK_HTML_LINE);
+         if (lastPoint == (test.length() - BLANK_HTML_LINE.length())) {
+            returnString.delete(lastPoint, returnString.length());
+         }
+         returnString.append(tokenToInsert);
+      }
+
+      return returnString.toString();
+   }
 
+   private int[] findEndOfList(char[] theChars, int startPoint) { // scans theChars from startPoint for the list's extent
+      int iPos = startPoint;
+      int[] iReturn = {0, theChars.length, theChars.length}; // read by processList as {first text char, end of list, start of what follows}
+      int tagCount = 0; // opening tags seen minus closing tags seen
+      boolean notFirst = false;
+      boolean foundNonTagItem = false;
+      while (iPos < theChars.length) {
+         while ((iPos < theChars.length) && Character.isWhitespace(theChars[iPos])) {
+            iPos++;
+         }
+         if (iPos >= theChars.length) {
+            iReturn[1] = theChars.length;
+            break;
+         }
+         if (theChars[iPos] == '<') {
+            int startofCloseTag = iPos;
+            iPos++;
+            if ((iPos < theChars.length) && (theChars[iPos] == '/')) { // bounds check: a trailing '<' must not index past the array
+               tagCount--;
+               while ((iPos < theChars.length) && (theChars[iPos] != '>')) {
+                  iPos++;
+               }
+               if (((tagCount == 0) && foundNonTagItem) || (tagCount < 0)) {
+                  iReturn[1] = startofCloseTag;
+                  iReturn[2] = iPos;
+                  while ((iReturn[2] < theChars.length) && (theChars[iReturn[2]] != '<')) {
+                     iReturn[2] = iReturn[2] + 1;
+                  }
+                  break;
+               }
+            } else {
+               tagCount++;
+            }
+            while ((iPos < theChars.length) && (theChars[iPos] != '>')) {
+               iPos++;
+            }
+            iPos++;
+         } else if (notFirst) {
+            if (!foundNonTagItem) {
+               iReturn[0] = iPos;
+               foundNonTagItem = true;
+            }
+            if (tagCount == 0) {
+               break;
+            } else {
+               // text inside open tags: skip ahead to the next tag and record the text's last index
+               while ((iPos < theChars.length) && (theChars[iPos] != '<')) {
+                  iPos++;
+               }
+               iReturn[1] = iPos - 1;
+               // find the end of that tag; the following content starts just past its '>'
+               iReturn[2] = iPos;
+               while ((iReturn[2] < theChars.length) && (theChars[iReturn[2]] != '>')) {
+                  iReturn[2] = iReturn[2] + 1;
+               }
+               iReturn[2] = iReturn[2] + 1;
+            }
+         } else {
+            // no opening tags, therefore list not enclosed in tags; stop and return the defaults
+            iPos = theChars.length;
          }
+         notFirst = true;
+      }
+      return iReturn;
+   }
 
+   static char[] stringBuilderToChars(StringBuilder sb) {
+      // copy the builder's current contents into a fresh, independent array
+      String snapshot = sb.toString();
+      return snapshot.toCharArray();
+   }
+
+   private static class listData { // result holder filled in by findNextListItem; static: it needs no enclosing instance
+      private boolean newList; // true when the hit found is the first label of a new (nested) list
+      private int itemLength; // number of characters in the label that was found, delimiter included
+      private int nextItem; // position recorded by findNextListItem; stays 0 until set
+
+      public listData() {
+         this.newList = false;
+         this.itemLength = 0;
       }
-      inArtifact = true;
+
+      public int getItemLength() {
+         return itemLength;
+      }
+
+      public int getNextItem() {
+         return nextItem;
+      }
+
+      public boolean getNewList() {
+         return newList;
+      }
+
+      public void setNextItem(int nextItem) {
+         this.nextItem = nextItem;
+      }
+
+      public void setItemLength(int itemLength) {
+         this.itemLength = itemLength;
+      }
+
+      public void setNewList(boolean newList) {
+         this.newList = newList;
+      }
+   }
+
+   private int findNextListItem(char[] theChars, int iPos, boolean isNumeric, boolean isLowerCase, int currentNumber, String currentLetter, listData listData) {
+      //@formatter:off
+      /****************************************************************************
+       * Scan past iPos for either 1) the next label of the current list (<boundary><next value>.), or
+       * 2) the first label of a nested list ("a.", "a)", "1." or "1)"). Returns the earliest hit's index in
+       * theChars, or -1 if none; listData reports whether a new list starts, the label length and its position.
+       */
+      //@formatter:on
+
+      iPos++;
+      if (iPos >= theChars.length) {
+         return -1;
+      }
+      StringBuilder asString = new StringBuilder();
+      asString.append(theChars, iPos, theChars.length - iPos);
+      int aListDot = asString.toString().toLowerCase().indexOf("a.");
+      int aListParen = asString.toString().toLowerCase().indexOf("a)"); // was a second search for "a."
+      int aList = -1;
+      if (aListDot == -1) {
+         aList = aListParen;
+      } else if (aListParen == -1) {
+         aList = aListDot;
+      } else {
+         aList = (aListDot < aListParen) ? aListDot : aListParen;
+      }
+      int oneListDot = asString.indexOf("1.");
+      int oneListParen = asString.indexOf("1)");
+      int oneList = -1;
+      if (oneListDot == -1) {
+         oneList = oneListParen;
+      } else if (oneListParen == -1) { // was testing aListParen, which could discard a found "1."
+         oneList = oneListDot;
+      } else {
+         oneList = (oneListDot < oneListParen) ? oneListDot : oneListParen;
+      }
+
+      int nextListItem = -1;
+      String nextItem = "";
+      if (isNumeric) {
+         nextItem = Integer.toString(currentNumber + 1) + ".";
+      } else {
+         // assume Ascii -- that is, that the letters are contiguous
+         byte[] theLetters = null;
+         try {
+            theLetters = currentLetter.getBytes("UTF-8");
+         } catch (UnsupportedEncodingException e) {
+            theLetters = currentLetter.getBytes();
+         }
+         int theCharToChange = theLetters.length - 1;
+         if (currentLetter.toLowerCase().charAt(theCharToChange) == 'z') {
+            if (theCharToChange > 0) {
+               theLetters[theCharToChange - 1]++;
+               if (isLowerCase) {
+                  theLetters[theCharToChange] = "a".getBytes()[0];
+               } else {
+                  theLetters[theCharToChange] = "A".getBytes()[0];
+               }
+            } else {
+               byte[] newLetterArray = new byte[theLetters.length + 1];
+               for (int i = 0; i < newLetterArray.length; i++) {
+                  if (isLowerCase) {
+                     newLetterArray[i] = "a".getBytes()[0];
+                  } else {
+                     newLetterArray[i] = "A".getBytes()[0];
+                  }
+               }
+               theLetters = newLetterArray;
+            }
+         } else {
+            theLetters[theCharToChange]++; // advance the last letter so "aa" is followed by "ab", not "ba"
+         }
+         nextItem = new String(theLetters) + ".";
+      }
+      nextListItem = asString.indexOf(nextItem);
+      if (nextListItem != -1) {
+         // verify this is not just a char and period; read from theChars so a hit at window index 0 is safe
+         char prev = (iPos + nextListItem > 0) ? theChars[iPos + nextListItem - 1] : ' ';
+         while (!(Character.isWhitespace(prev) || (prev == ';') || (prev == '>'))) {
+            nextListItem = asString.indexOf(nextItem, nextListItem + 1);
+            if (nextListItem == -1) {
+               break;
+            }
+            prev = asString.charAt(nextListItem - 1);
+         }
+      }
+      if ((aList == -1) && (oneList == -1) && (nextListItem == -1)) {
+         return -1;
+      }
+      aList = (aList != -1) ? aList + iPos : theChars.length + 1; // window offsets -> theChars indices; misses sort last
+      oneList = (oneList != -1) ? oneList + iPos : theChars.length + 1;
+      nextListItem = (nextListItem != -1) ? nextListItem + iPos : theChars.length + 1;
+      int iReturn = (aList < oneList) ? aList : oneList;
+      iReturn = (iReturn < nextListItem) ? iReturn : nextListItem;
+      if (iReturn == nextListItem) {
+         listData.setNewList(false);
+         listData.setItemLength(nextItem.length());
+         listData.setNextItem(nextListItem);
+      } else {
+         listData.setNewList(true);
+         listData.setItemLength(2);
+         listData.setNextItem(nextListItem - 1);
+      }
+      return iReturn;
    }
 
    private void processArtifact() throws OseeCoreException {
@@ -585,7 +999,7 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
             case REQUIREMENTS:
                StringBuffer imageFileList = new StringBuffer("");
                getImageList(rowValue, imageFileList);
-               rowValue = normailizeHtml(rowValue);
+               rowValue = normalizeHtml(rowValue);
                String imageFile = imageFileList.toString();
                if (!imageFile.isEmpty()) {
                   String theImage;
@@ -783,9 +1197,9 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
       return returnValue;
    }
 
-   private String normailizeHtml(String inputHtml) {
+   private String normalizeHtml(String inputHtml) {
 
-      String returnValue = NormalizeHtml.convertToNormalizedHTML(inputHtml);
+      String returnValue = NormalizeHtml.convertToNormalizedHTML(inputHtml, true, true, true);
       int bodyStart = returnValue.indexOf(BODY_START_TAG);
       int bodyEnd = returnValue.indexOf(BODY_END_TAG);
       if (bodyStart != -1) {
@@ -821,10 +1235,10 @@ public class DoorsArtifactExtractor extends AbstractArtifactExtractor {
        * tags these are not meaningful
        */
       returnValue = returnValue.trim();
-      int brTag = returnValue.lastIndexOf(BR_TAG);
-      while (brTag == returnValue.length() - BR_TAG.length()) {
+      int brTag = returnValue.toLowerCase().lastIndexOf(BR_TAG);
+      while ((brTag != -1) && (brTag == returnValue.length() - BR_TAG.length())) {
          returnValue = returnValue.substring(0, brTag).trim();
-         brTag = returnValue.lastIndexOf(BR_TAG);
+         brTag = returnValue.toLowerCase().lastIndexOf(BR_TAG);
       }
       return returnValue;
    }
diff --git a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java
index 2724c31da23..04e3af90248 100644
--- a/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java
+++ b/plugins/org.eclipse.osee.framework.skynet.core/src/org/eclipse/osee/framework/skynet/core/utility/NormalizeHtml.java
@@ -10,6 +10,7 @@
  *******************************************************************************/
 package org.eclipse.osee.framework.skynet.core.utility;
 
+import java.util.ArrayList;
 import java.util.TreeMap;
 import org.jsoup.Jsoup;
 import org.jsoup.nodes.Attribute;
@@ -19,7 +20,6 @@ import org.jsoup.nodes.Document.OutputSettings;
 import org.jsoup.nodes.Document.QuirksMode;
 import org.jsoup.nodes.Element;
 import org.jsoup.nodes.Entities.EscapeMode;
-import org.jsoup.nodes.TextNode;
 import org.jsoup.parser.Tag;
 import org.jsoup.select.Elements;
 
@@ -63,11 +63,15 @@ public final class NormalizeHtml {
    private static final String rdquo = String.valueOf('\u201D');
    private static final String lsquo = String.valueOf('\u2018');
    private static final String rsquo = String.valueOf('\u2019');
+   private static final String figureDash = String.valueOf('\u2012');
+   private static final String enDash = String.valueOf('\u2013');
+   private static final String emDash = String.valueOf('\u2014');
    private static final String NON_BREAK_SPACE = String.valueOf('\u00A0');
    private static final String NON_BREAK_FIGURE_SPACE = String.valueOf('\u2007');
    private static final String NON_BREAK_NARROW_SPACE = String.valueOf('\u202F');
    private static final String NON_BREAK_WORD_JOINER = String.valueOf('\u2060');
    private static final String NON_BREAK_ZERO_WIDTH = String.valueOf('\uFEFF');
+   private static ArrayList<String> allowedAttributes = null;
 
    private NormalizeHtml() {
       // Utility Class
@@ -93,7 +97,7 @@ public final class NormalizeHtml {
     * @return Normalized HTML
     */
    public static String convertToNormalizedHTML(String inputHTML) {
-      return convertToNormalizedHTML(inputHTML, false, false);
+      return convertToNormalizedHTML(inputHTML, false, false, false); // removeInitialStyle, removeEmptyTags, removeHeaderFooter all off
    }
 
    /**
@@ -103,10 +107,10 @@ public final class NormalizeHtml {
     * 
     * @param inputHTML HTML source to be normalized
     * @param removeInitialStyle Remove initial style information.
-    * @param removeEmptyStyle Remove any empty (containing no text) style sections
+    * @param removeEmptyTags Remove or unwrap text-less styled elements, div wrappers and name-only anchors
     * @return Normalized HTML
     */
-   public static String convertToNormalizedHTML(String inputHTML, boolean removeInitialStyle, boolean removeEmptyStyle) {
+   public static String convertToNormalizedHTML(String inputHTML, boolean removeInitialStyle, boolean removeEmptyTags, boolean removeHeaderFooter) {
       Document doc = Jsoup.parse(inputHTML);
       doc.quirksMode(QuirksMode.noQuirks);
       OutputSettings outputSettings = doc.outputSettings();
@@ -134,77 +138,154 @@ public final class NormalizeHtml {
          e.tagName("span");
          e.attr("style", "text-decoration: line-through;");
       }
+      removeDepreactedTags(doc);
+      processTagsWithAttributes(doc);
+      processHeaderFooter(doc, removeHeaderFooter);
       processFontTags(doc);
       processInitialStyleTags(doc, removeInitialStyle);
-      processEmptyStyleTags(doc, removeEmptyStyle);
+      processEmptyTags(doc, removeEmptyTags);
       return processText(doc);
    }
 
+   /**
+    * Replaces every deprecated {@code <center>} element with its own child nodes. unwrap() keeps text nodes as
+    * well as child elements, so text placed directly inside the element is preserved; only the wrapper goes.
+    */
+   static void removeDepreactedTags(Document doc) {
+      for (Element centered : doc.select("center")) {
+         // splice all child nodes (text included, original order) into the parent in place of the wrapper
+         centered.unwrap();
+      }
+   }
+
+   private static void processTagsWithAttributes(Document doc) {
+      /****************************************************************************
+       * HTML allows the same table or image to be represented in many ways. To make the result the same regardless
+       * of the source editor, table, tr and img elements keep only the attributes named in allowedAttributes,
+       * td attributes are moved onto a tr, colgroup is deleted and thead / tfoot / tbody are dissolved into the
+       * table. Note that this simplifies the table, that is, some formatting may be lost.
+       */
+      // Built once on first use; a name's position in this list is also the order attributes are re-added in
+      if (allowedAttributes == null) {
+         allowedAttributes = new ArrayList<String>();
+         allowedAttributes.add("border");
+         allowedAttributes.add("frame");
+         allowedAttributes.add("rules");
+         allowedAttributes.add("valign");
+         allowedAttributes.add("src");
+      }
+      Elements tables = doc.select("table");
+      for (Element table : tables) {
+         removeUnsupportedAttributes(table, true);
+         // remove colgroup elements, children included
+         Elements colgroup = table.select("colgroup");
+         for (Element c : colgroup) {
+            c.remove();
+         }
+         // no support for header / footer -- just rows
+         // (removeElements keeps each wrapper's child elements and drops the wrapper itself)
+         removeElements(table, "thead");
+         removeElements(table, "tfoot");
+         removeElements(table, "tbody");
+         // strip all attributes from every td, then copy the allowed values (lower-cased) onto a tr
+         Elements rows = table.select("td");
+         for (Element row : rows) {
+            String[] attributeValues = removeUnsupportedAttributes(row, false); // false: td keeps no attributes
+            Element tr = null;
+            Element parent = row.parent();
+            if (parent.tagName().equals("tr")) {
+               tr = parent;
+            } else { // td is not a direct child of a tr: use the first tr among its siblings, if any
+               Elements siblings = row.siblingElements();
+               for (Element e : siblings) {
+                  if (e.tagName().equals("tr")) {
+                     tr = e;
+                     break;
+                  }
+               }
+            }
+            if (tr != null) { // values from later cells overwrite those set by earlier cells
+               for (int i = 0; i < attributeValues.length; i++) {
+                  if (attributeValues[i] != null) {
+                     tr.attr(allowedAttributes.get(i), attributeValues[i].toLowerCase());
+                  }
+               }
+            }
+         }
+         rows = table.select("tr"); // finally reduce the tr elements themselves to the allowed attributes
+         for (Element row : rows) {
+            removeUnsupportedAttributes(row, true);
+         }
+      }
+      // img elements are likewise reduced to the allowed attributes
+      Elements images = doc.select("img");
+      for (Element image : images) {
+         removeUnsupportedAttributes(image, true);
+      }
+   }
+
    static void processInitialStyleTags(Document doc, boolean removeInitialStyle) {
       if (removeInitialStyle) {
-         boolean foundText = false;
          Elements pTags = doc.select("p");
          for (Element p : pTags) {
-            Elements style = p.getElementsByAttribute("style");
-            for (Element e : style) {
-               Element parent = e.parent();
-               if (!parent.tagName().equals("span")) {
-                  if (e.hasText()) {
-                     String text = e.text();
-                     TextNode newNode = new TextNode(text, e.baseUri());
-                     e.remove();
-                     // Insert newline between various text elements
-                     if (foundText) {
-                        Tag tag = Tag.valueOf("br");
-                        Element br = new Element(tag, parent.baseUri());
-                        parent.appendChild(br);
-                        br = new Element(tag, parent.baseUri());
-                        parent.appendChild(br);
-                     }
-                     parent.appendChild(newNode);
-                     foundText = true;
-                     break;
+            if (!p.attr("style").equals("")) { // only paragraphs carrying a non-empty style attribute
+               if (p.hasText()) { // ... that also contain text
+                  if (!p.parent().tagName().equals("li")) { // no extra line break inside list items
+                     Element cr = new Element(Tag.valueOf("br"), p.baseUri()); // br stands in for the paragraph break
+                     p.after(cr);
                   }
+                  p.unwrap(); // keep the paragraph's content, drop the p wrapper and its style
                }
             }
          }
          Elements span = doc.select("span");
          for (Element s : span) {
-            Elements style = s.getElementsByAttributeValueMatching("style", "font*|margin*");
-            for (Element e : style) {
-               Element parent = e.parent();
-               if (!parent.tagName().equals("p")) {
-                  if (e.hasText()) {
-                     String text = e.text();
-                     TextNode newNode = new TextNode(text, e.baseUri());
-                     e.remove();
-                     parent.appendChild(newNode);
-                     // Insert newline between various text elements
-                     if (foundText) {
-                        Tag tag = Tag.valueOf("br");
-                        Element br = new Element(tag, parent.baseUri());
-                        parent.appendChild(br);
-                        parent.appendChild(br);
-                     }
-                     foundText = true;
-                     break;
-                  }
-               }
+            if (!s.attr("style").equals("") && !s.hasText()) { // styled span with no text at all
+               s.remove(); // NOTE(review): also discards non-text children such as img - confirm intended
             }
          }
       }
    }
 
-   private static void processEmptyStyleTags(Document doc, boolean removeEmptyStyle) {
-      if (removeEmptyStyle) {
+   private static void processEmptyTags(Document doc, boolean removeEmptyTags) {
+      if (removeEmptyTags) { // the whole pass is skipped unless requested
          Elements pTags = doc.select("p");
          for (Element p : pTags) {
+            // deleteEmptyElemens unwraps elements selected from p whose style matches font*|margin* and that
+            // contain no text; elements that do contain text are left untouched
             deleteEmptyElemens(p);
          }
          Elements span = doc.select("span");
          for (Element s : span) {
             deleteEmptyElemens(s);
          }
+         Elements div = doc.select("div");
+         for (Element e : div) {
+            if (!e.hasText()) { // NOTE(review): a text-less div is removed with all children (e.g. img) - confirm
+               e.remove();
+            } else {
+               e.unwrap(); // div has text: keep its content, drop the wrapper
+            }
+         }
+         Elements aTags = doc.select("a");
+         for (Element a : aTags) {
+            Attributes attr = a.attributes();
+            if ((attr.size() == 1) && (!attr.get("name").equals(""))) { // only attribute is a non-empty name
+               a.unwrap(); // keep the anchor's content, drop the a element
+            }
+         }
+      }
+   }
+
+   /** If requested, removes each element in (or being) a div whose "type" attribute matches HEADER*|FOOTER*. */
+   static void processHeaderFooter(Document doc, boolean removeHeaderFooter) {
+      Elements div = removeHeaderFooter ? doc.select("div") : new Elements();
+      for (Element d : div) {
+         for (Element hf : d.getElementsByAttributeValueMatching("type", "HEADER*|FOOTER*")) {
+            if (hf.parent() != null) { // already detached via an enclosing div: a second remove() would fail
+               hf.remove();
+            }
+         }
       }
    }
 
@@ -215,18 +296,12 @@ public final class NormalizeHtml {
       Elements style = elementToCheck.getElementsByAttributeValueMatching("style", "font*|margin*");
       for (Element e : style) {
          if (e.hasText()) {
-            break;
+            continue;
          } else {
-            Elements images = e.select("img");
-            Element parent = e.parent();
-            e.remove();
-            if (images.size() > 0) {
-               for (Element image : images) {
-                  parent.appendChild(image);
-               }
-            }
+            e.unwrap();
          }
       }
+
    }
 
    private static String processText(Document doc) {
@@ -243,6 +318,14 @@ public final class NormalizeHtml {
       theText = theText.replaceAll(rsquo, "'");
       theText = theText.replaceAll("&apos;", "'");
 
+      /************************************************************************
+       * Convert &ndash; and Unicode dashes to -. Not all editors handle this correctly
+       */
+      theText = theText.replaceAll("&ndash;", "-");
+      theText = theText.replaceAll(figureDash, "-");
+      theText = theText.replaceAll(enDash, "-");
+      theText = theText.replaceAll(emDash, "-");
+
       //@formatter:off
       /*****************************************************************************
        * Convert the non-blocking characters to the HTML value (&nbsp;) 
@@ -324,4 +407,39 @@ public final class NormalizeHtml {
       return theReturn;
    }
 
+   /**
+    * Hoists the child elements of each match of the selector under {@code table} in front of that match, then removes the match.
+    */
+   static void removeElements(Element table, String theElementParent) {
+      for (Element wrapper : table.select(theElementParent)) {
+         for (Element child : wrapper.children()) { // child elements only; other nodes go away with the wrapper
+            wrapper.before(child);
+         }
+         wrapper.remove();
+      }
+   }
+
+   /**
+    * Strips all attributes from e and returns the values of the allowed ones, indexed as in allowedAttributes
+    * (border="0" is treated as absent). With addBack the kept values are re-added to e in that fixed order.
+    */
+   static String[] removeUnsupportedAttributes(Element e, boolean addBack) {
+      String[] attributeValues = new String[allowedAttributes.size()]; // allowedAttributes must already be built
+      for (Attribute a : e.attributes().asList()) { // snapshot: removeAttr below mutates e.attributes()
+         int slot = allowedAttributes.indexOf(a.getKey());
+         if ((slot != -1) && !(a.getKey().equals("border") && a.getValue().equals("0"))) {
+            attributeValues[slot] = a.getValue();
+         }
+         e.removeAttr(a.getKey());
+      }
+      if (addBack) {
+         for (int i = 0; i < attributeValues.length; i++) {
+            if (attributeValues[i] != null) {
+               e.attr(allowedAttributes.get(i), attributeValues[i]);
+            }
+         }
+      }
+      return attributeValues;
+   }
+
 }
author	mpotterc0k	2013-08-19 15:36:25 +0000
committer	Gerrit Code Review @ Eclipse.org	2013-08-29 16:13:13 +0000
commit	b7c17cd7038b4adfc1e9b14f568540baed73b556 (patch)
tree	880537cc77a652f721b2db17168dae276dce5e43
parent	419c045952758bcd1c7a2aba685d890ea20db86a (diff)
download	org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.gz org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.tar.xz org.eclipse.osee-b7c17cd7038b4adfc1e9b14f568540baed73b556.zip