Skip to main content
about summary refs log tree commit diff stats
diff options
context:
space:
mode:
-rw-r--r-- org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java 37
-rw-r--r-- org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs 3
-rw-r--r-- org.eclipse.ua.tests/data/help/search/testnl8859_html5.html 17
-rw-r--r-- org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html 20
-rw-r--r-- org.eclipse.ua.tests/data/help/search/toc.xml 4
-rw-r--r-- org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java 23
6 files changed, 92 insertions, 12 deletions
diff --git a/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java b/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java
index c978caac5..9058ee66e 100644
--- a/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java
+++ b/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2000, 2015 IBM Corporation and others.
+ * Copyright (c) 2000, 2020 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
@@ -42,6 +42,7 @@ public class HTMLDocParser {
final static String ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$
final static String ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$
final static String ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$
+ final static String ATTRIBUTE_CHARSET = "charset"; //$NON-NLS-1$
// states for parsing elements
final static int STATE_ELEMENT_START = 0;
@@ -58,6 +59,11 @@ public class HTMLDocParser {
final static int STATE_CONTENT_AFTER_NAME = 1;
final static int STATE_CONTENT_AFTER_EQ = 2;
final static int STATE_CONTENT_DONE = 3;
+ // states for parsing CHARSET attribute
+ final static int STATE_CHARSET_START = 0;
+ final static int STATE_CHARSET_AFTER_NAME = 1;
+ final static int STATE_CHARSET_AFTER_EQ = 2;
+ final static int STATE_CHARSET_DONE = 3;
private HTMLParser htmlParser;
private InputStream inputStream = null;
@@ -166,6 +172,7 @@ public class HTMLDocParser {
int stateContent = STATE_HTTP_START;
int stateElement = STATE_ELEMENT_START;
int stateHttp = STATE_HTTP_START;
+ int stateCharset = STATE_CHARSET_START;
try {
// in the worst case, process tokens until end of file
@@ -199,6 +206,7 @@ public class HTMLDocParser {
// META element opened
stateElement = STATE_ELEMENT_META;
// initialize state of attributes
+ stateCharset = STATE_CHARSET_START;
stateHttp = STATE_HTTP_START;
stateContent = STATE_CONTENT_START;
contentValue = null;
@@ -241,7 +249,10 @@ public class HTMLDocParser {
break;
case StreamTokenizer.TT_WORD :
// string inside META tag, can be attribute name
- if (ATTRIBUTE_HTTP
+ if (ATTRIBUTE_CHARSET.equalsIgnoreCase(tokenizer.sval)) {
+ // found CHARSET attribute name
+ stateCharset = STATE_CHARSET_AFTER_NAME;
+ } else if (ATTRIBUTE_HTTP
.equalsIgnoreCase(tokenizer.sval)) {
// found HTTP-EQUIV attribute name
stateHttp = STATE_HTTP_AFTER_NAME;
@@ -260,6 +271,9 @@ public class HTMLDocParser {
// some other attribute name or string,
// reset states of sought attributes,
// unless successfully processed earlier
+ if (stateCharset != STATE_CHARSET_DONE) {
+ stateCharset = STATE_CHARSET_START;
+ }
if (stateHttp != STATE_HTTP_DONE) {
stateHttp = STATE_HTTP_START;
}
@@ -271,7 +285,10 @@ public class HTMLDocParser {
case '=' :
// = inside META tag, can separate the attribute
// names we are interested in from their values
- if (stateHttp == STATE_HTTP_AFTER_NAME) {
+ if (stateCharset == STATE_CHARSET_AFTER_NAME) {
+ // we have CHARSET=
+ stateCharset = STATE_CHARSET_AFTER_EQ;
+ } else if (stateHttp == STATE_HTTP_AFTER_NAME) {
// we have HTTP-EQUIV=
stateHttp = STATE_HTTP_AFTER_EQ;
} else if (stateContent == STATE_CONTENT_AFTER_NAME) {
@@ -282,6 +299,9 @@ public class HTMLDocParser {
// name or string,
// reset states of sought attributes,
// unless successfully processed earlier
+ if (stateCharset != STATE_CHARSET_DONE) {
+ stateCharset = STATE_CHARSET_START;
+ }
if (stateHttp != STATE_HTTP_DONE) {
stateHttp = STATE_HTTP_START;
}
@@ -293,7 +313,12 @@ public class HTMLDocParser {
case '\"' :
// quoted string inside META tag, can be
// attribute value
- if (stateHttp == STATE_HTTP_AFTER_EQ) {
+ if (stateCharset == STATE_CHARSET_AFTER_EQ) {
+ // value of CHARSET attribute
+ // we found <META CHARSET="***"
+ stateContent = STATE_CHARSET_DONE;
+ return tokenizer.sval.isEmpty() ? null : tokenizer.sval;
+ } else if (stateHttp == STATE_HTTP_AFTER_EQ) {
// value of HTTP-EQUIV attribute
if (ATTRIBUTE_HTTP_VALUE
.equalsIgnoreCase(tokenizer.sval)) {
@@ -314,6 +339,7 @@ public class HTMLDocParser {
} else {
// value for the attribute is missing
// reset states of sought attributes
+ stateCharset = STATE_CHARSET_START;
stateHttp = STATE_HTTP_START;
stateContent = STATE_CONTENT_START;
}
@@ -322,6 +348,9 @@ public class HTMLDocParser {
// other unexpected token inside META tag
// reset states of sought attributes,
// unless successfully processed earlier
+ if (stateCharset != STATE_CHARSET_DONE) {
+ stateCharset = STATE_CHARSET_START;
+ }
if (stateHttp != STATE_HTTP_DONE) {
stateHttp = STATE_HTTP_START;
}
diff --git a/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs b/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs
index 88a681f43..af5c9e96a 100644
--- a/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs
+++ b/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs
@@ -1,7 +1,8 @@
-#Tue Aug 18 14:40:22 PDT 2009
eclipse.preferences.version=1
encoding//data/help/search/testnl8859.htm=ISO-8859-1
+encoding//data/help/search/testnl8859_html5.html=ISO-8859-1
encoding//data/help/search/testnlUTF8.htm=UTF-8
+encoding//data/help/search/testnlUTF8_html5.html=UTF-8
encoding//data/help/toc/filteredToc/parent8859.html=ISO-8859-1
encoding//data/help/toc/filteredToc/parentUTF8.html=UTF-8
encoding//non_junit/test_plan.htm=ISO-8859-1
diff --git a/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html b/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html
new file mode 100644
index 000000000..5aa137395
--- /dev/null
+++ b/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html
@@ -0,0 +1,17 @@
+<!DOCTYPE html>
+<html lang="en-us">
+<head>
+<meta charset="ISO-8859-1">
+<title>ISO 8859-1 HTML5 doc</title>
+</head>
+<body>
+
+This is a test document to search words declared in a document
+encoded in ISO-8859-1
+
+The test searches for this unique string:
+
+águilaxaxcs
+
+</body>
+</html>
diff --git a/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html b/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html
new file mode 100644
index 000000000..206b824ae
--- /dev/null
+++ b/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html
@@ -0,0 +1,20 @@
+<!DOCTYPE html>
+<html lang="en-us">
+<head>
+<meta charset="UTF-8">
+<title>UTF-8 HTML5 doc</title>
+</head>
+<body>
+
+This is a test for search within a UTF-8 encoded document
+
+The test searches for these unique strings:
+
+acfeleón
+
+農曆新年
+
+אַסְטְרוֹנוֹמְיָה) לִקּוּי (ירח או שמש
+
+</body>
+</html>
diff --git a/org.eclipse.ua.tests/data/help/search/toc.xml b/org.eclipse.ua.tests/data/help/search/toc.xml
index 7651523b9..b822a0ae3 100644
--- a/org.eclipse.ua.tests/data/help/search/toc.xml
+++ b/org.eclipse.ua.tests/data/help/search/toc.xml
@@ -2,7 +2,7 @@
<?NLS TYPE="org.eclipse.help.toc"?>
<!--
- Copyright (c) 2005, 2009 IBM Corporation and others.
+ Copyright (c) 2005, 2020 IBM Corporation and others.
This program and the accompanying materials
are made available under the terms of the Eclipse Public License 2.0
@@ -35,8 +35,10 @@
</topic>
<topic href="data/help/search/testnl8859.htm" label="ISO 8859-1 doc">
</topic>
+ <topic href="data/help/search/testnl8859_html5.html" label="ISO 8859-1 doc html5"/>
<topic href="data/help/search/testnlUTF8.htm" label="UTF-8 doc">
</topic>
+ <topic href="data/help/search/testnlUTF8_html5.html" label="UTF-8 doc html5"/>
<topic href="data/help/search/testMeta.htm" label="HTML doc with meta">
</topic>
<topic href="data/help/search/testMeta.xhtml" label="XHTML doc with meta">
diff --git a/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java b/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java
index a2b550832..29171e1a6 100644
--- a/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java
+++ b/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java
@@ -1,5 +1,5 @@
/*******************************************************************************
- * Copyright (c) 2009, 2016 IBM Corporation and others.
+ * Copyright (c) 2009, 2020 IBM Corporation and others.
*
* This program and the accompanying materials
* are made available under the terms of the Eclipse Public License 2.0
@@ -18,7 +18,10 @@ import org.junit.Test;
public class EncodedCharacterSearch {
@Test
public void testIso8859() {
- SearchTestUtils.searchOneLocale("\u00E1guilaxaxcs", new String[] {"/org.eclipse.ua.tests/data/help/search/testnl8859.htm" }, "en");
+ SearchTestUtils.searchOneLocale("\u00E1guilaxaxcs",
+ new String[] { "/org.eclipse.ua.tests/data/help/search/testnl8859.htm",
+ "/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html" },
+ "en");
}
@Test
@@ -28,19 +31,27 @@ public class EncodedCharacterSearch {
@Test
public void testUtf8Accented() {
- SearchTestUtils.searchOneLocale("acfele\u00F3n", new String[] {"/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm" }, "en");
+ SearchTestUtils.searchOneLocale("acfele\u00F3n",
+ new String[] { "/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm",
+ "/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html" },
+ "en");
}
@Test
public void testUtf8Chinese() {
- SearchTestUtils.searchOneLocale("\u8FB2\u66C6\u65B0\u5E74", new String[] {"/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm" }, "en");
+ SearchTestUtils.searchOneLocale("\u8FB2\u66C6\u65B0\u5E74",
+ new String[] { "/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm",
+ "/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html" },
+ "en");
}
@Test
public void testUtf8Hebrew() {
SearchTestUtils.searchOneLocale("\u05D0\u05B7\u05E1\u05B0\u05D8\u05B0\u05E8\u05D5\u05B9\u05E0\u05D5\u05B9\u05DE"
- + "\u05B0\u05D9\u05B8\u05D4) \u05DC\u05B4\u05E7\u05BC\u05D5\u05BC\u05D9 (\u05D9\u05E8\u05D7 \u05D0\u05D5 \u05E9\u05DE\u05E9", new String[] {"/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm" }, "en");
+ + "\u05B0\u05D9\u05B8\u05D4) \u05DC\u05B4\u05E7\u05BC\u05D5\u05BC\u05D9 (\u05D9\u05E8\u05D7 \u05D0\u05D5 \u05E9\u05DE\u05E9",
+ new String[] { "/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm",
+ "/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html" },
+ "en");
}
-
}

Back to the top