diff options
6 files changed, 92 insertions, 12 deletions
diff --git a/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java b/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java index c978caac5..9058ee66e 100644 --- a/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java +++ b/org.eclipse.help.base/src/org/eclipse/help/internal/search/HTMLDocParser.java @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2000, 2015 IBM Corporation and others. + * Copyright (c) 2000, 2020 IBM Corporation and others. * * This program and the accompanying materials * are made available under the terms of the Eclipse Public License 2.0 @@ -42,6 +42,7 @@ public class HTMLDocParser { final static String ATTRIBUTE_HTTP = "http-equiv"; //$NON-NLS-1$ final static String ATTRIBUTE_HTTP_VALUE = "content-type"; //$NON-NLS-1$ final static String ATTRIBUTE_CONTENT = "content"; //$NON-NLS-1$ + final static String ATTRIBUTE_CHARSET = "charset"; //$NON-NLS-1$ // states for parsing elements final static int STATE_ELEMENT_START = 0; @@ -58,6 +59,11 @@ public class HTMLDocParser { final static int STATE_CONTENT_AFTER_NAME = 1; final static int STATE_CONTENT_AFTER_EQ = 2; final static int STATE_CONTENT_DONE = 3; + // states for parsing CHARSET attribute + final static int STATE_CHARSET_START = 0; + final static int STATE_CHARSET_AFTER_NAME = 1; + final static int STATE_CHARSET_AFTER_EQ = 2; + final static int STATE_CHARSET_DONE = 3; private HTMLParser htmlParser; private InputStream inputStream = null; @@ -166,6 +172,7 @@ public class HTMLDocParser { int stateContent = STATE_HTTP_START; int stateElement = STATE_ELEMENT_START; int stateHttp = STATE_HTTP_START; + int stateCharset = STATE_CHARSET_START; try { // in the worst case, process tokens until end of file @@ -199,6 +206,7 @@ public class HTMLDocParser { // META element opened stateElement = STATE_ELEMENT_META; // initialize state of attributes + stateCharset = 
STATE_CHARSET_START; stateHttp = STATE_HTTP_START; stateContent = STATE_CONTENT_START; contentValue = null; @@ -241,7 +249,10 @@ public class HTMLDocParser { break; case StreamTokenizer.TT_WORD : // string inside META tag, can be attribute name - if (ATTRIBUTE_HTTP + if (ATTRIBUTE_CHARSET.equalsIgnoreCase(tokenizer.sval)) { + // found CHARSET attribute name + stateCharset = STATE_CHARSET_AFTER_NAME; + } else if (ATTRIBUTE_HTTP .equalsIgnoreCase(tokenizer.sval)) { // found HTTP-EQUIV attribute name stateHttp = STATE_HTTP_AFTER_NAME; @@ -260,6 +271,9 @@ public class HTMLDocParser { // some other attribute name or string, // reset states of seeked attributes, // unless successfully processed earlier + if (stateCharset != STATE_CHARSET_DONE) { + stateCharset = STATE_CHARSET_START; + } if (stateHttp != STATE_HTTP_DONE) { stateHttp = STATE_HTTP_START; } @@ -271,7 +285,10 @@ public class HTMLDocParser { case '=' : // = inside META tag, can separate interesing us // attribute names from values - if (stateHttp == STATE_HTTP_AFTER_NAME) { + if (stateCharset == STATE_CHARSET_AFTER_NAME) { + // we have CHARSET= + stateCharset = STATE_CHARSET_AFTER_EQ; + } else if (stateHttp == STATE_HTTP_AFTER_NAME) { // we have HTTP-EQUIV= stateHttp = STATE_HTTP_AFTER_EQ; } else if (stateContent == STATE_CONTENT_AFTER_NAME) { @@ -282,6 +299,9 @@ public class HTMLDocParser { // name or string, // reset states of seeked attributes, // unless successfully processed earlier + if (stateCharset != STATE_CHARSET_DONE) { + stateCharset = STATE_CHARSET_START; + } if (stateHttp != STATE_HTTP_DONE) { stateHttp = STATE_HTTP_START; } @@ -293,7 +313,12 @@ public class HTMLDocParser { case '\"' : // quoted string inside META tag, can be // attribute value - if (stateHttp == STATE_HTTP_AFTER_EQ) { + if (stateCharset == STATE_CHARSET_AFTER_EQ) { + // value of CHARSET attribute + // we found <META CHARSET="***" + stateContent = STATE_CHARSET_DONE; + return tokenizer.sval.isEmpty() ? 
null : tokenizer.sval; + } else if (stateHttp == STATE_HTTP_AFTER_EQ) { // value of HTTP-EQUIV attribute if (ATTRIBUTE_HTTP_VALUE .equalsIgnoreCase(tokenizer.sval)) { @@ -314,6 +339,7 @@ public class HTMLDocParser { } else { // value for the attribute is missing // reset states of seeked attributes + stateCharset = STATE_CHARSET_START; stateHttp = STATE_HTTP_START; stateContent = STATE_CONTENT_START; } @@ -322,6 +348,9 @@ public class HTMLDocParser { // other unexpected token inside META tag // reset states of seeked attributes, // unless successfully processed earlier + if (stateCharset != STATE_CHARSET_DONE) { + stateCharset = STATE_CHARSET_START; + } if (stateHttp != STATE_HTTP_DONE) { stateHttp = STATE_HTTP_START; } diff --git a/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs b/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs index 88a681f43..af5c9e96a 100644 --- a/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs +++ b/org.eclipse.ua.tests/.settings/org.eclipse.core.resources.prefs @@ -1,7 +1,8 @@ -#Tue Aug 18 14:40:22 PDT 2009 eclipse.preferences.version=1 encoding//data/help/search/testnl8859.htm=ISO-8859-1 +encoding//data/help/search/testnl8859_html5.html=ISO-8859-1 encoding//data/help/search/testnlUTF8.htm=UTF-8 +encoding//data/help/search/testnlUTF8_html5.html=UTF-8 encoding//data/help/toc/filteredToc/parent8859.html=ISO-8859-1 encoding//data/help/toc/filteredToc/parentUTF8.html=UTF-8 encoding//non_junit/test_plan.htm=ISO-8859-1 diff --git a/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html b/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html new file mode 100644 index 000000000..5aa137395 --- /dev/null +++ b/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html @@ -0,0 +1,17 @@ +<!DOCTYPE html> +<html lang="en-us"> +<head> +<meta charset="ISO-8859-1"> +<title>ISO 8859-1 HTML5 doc</title> +</head> +<body> + +This is a test document to search words declared in a document +encoded in 
ISO-8859-1 + +The test searches for this unique string: + +águilaxaxcs + +</body> +</html> diff --git a/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html b/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html new file mode 100644 index 000000000..206b824ae --- /dev/null +++ b/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html @@ -0,0 +1,20 @@ +<!DOCTYPE html> +<html lang="en-us"> +<head> +<meta charset="UTF-8"> +<title>UTF-8 HTML5 doc</title> +</head> +<body> + +This is a test for search within a UTF-8 encoded document + +The test searches for these unique strings: + +acfeleón + +農曆新年 + +אַסְטְרוֹנוֹמְיָה) לִקּוּי (ירח או שמש + +</body> +</html> diff --git a/org.eclipse.ua.tests/data/help/search/toc.xml b/org.eclipse.ua.tests/data/help/search/toc.xml index 7651523b9..b822a0ae3 100644 --- a/org.eclipse.ua.tests/data/help/search/toc.xml +++ b/org.eclipse.ua.tests/data/help/search/toc.xml @@ -2,7 +2,7 @@ <?NLS TYPE="org.eclipse.help.toc"?> <!-- - Copyright (c) 2005, 2009 IBM Corporation and others. + Copyright (c) 2005, 2020 IBM Corporation and others. 
This program and the accompanying materials are made available under the terms of the Eclipse Public License 2.0 @@ -35,8 +35,10 @@ </topic> <topic href="data/help/search/testnl8859.htm" label="ISO 8859-1 doc"> </topic> + <topic href="data/help/search/testnl8859_html5.html" label="ISO 8859-1 doc html5"/> <topic href="data/help/search/testnlUTF8.htm" label="UTF-8 doc"> </topic> + <topic href="data/help/search/testnlUTF8_html5.html" label="UTF-8 doc html5"/> <topic href="data/help/search/testMeta.htm" label="HTML doc with meta"> </topic> <topic href="data/help/search/testMeta.xhtml" label="XHTML doc with meta"> diff --git a/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java b/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java index a2b550832..29171e1a6 100644 --- a/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java +++ b/org.eclipse.ua.tests/help/org/eclipse/ua/tests/help/search/EncodedCharacterSearch.java @@ -1,5 +1,5 @@ /******************************************************************************* - * Copyright (c) 2009, 2016 IBM Corporation and others. + * Copyright (c) 2009, 2020 IBM Corporation and others. 
* * This program and the accompanying materials * are made available under the terms of the Eclipse Public License 2.0 @@ -18,7 +18,10 @@ import org.junit.Test; public class EncodedCharacterSearch { @Test public void testIso8859() { - SearchTestUtils.searchOneLocale("\u00E1guilaxaxcs", new String[] {"/org.eclipse.ua.tests/data/help/search/testnl8859.htm" }, "en"); + SearchTestUtils.searchOneLocale("\u00E1guilaxaxcs", + new String[] { "/org.eclipse.ua.tests/data/help/search/testnl8859.htm", + "/org.eclipse.ua.tests/data/help/search/testnl8859_html5.html" }, + "en"); } @Test @@ -28,19 +31,27 @@ public class EncodedCharacterSearch { @Test public void testUtf8Accented() { - SearchTestUtils.searchOneLocale("acfele\u00F3n", new String[] {"/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm" }, "en"); + SearchTestUtils.searchOneLocale("acfele\u00F3n", + new String[] { "/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm", + "/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html" }, + "en"); } @Test public void testUtf8Chinese() { - SearchTestUtils.searchOneLocale("\u8FB2\u66C6\u65B0\u5E74", new String[] {"/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm" }, "en"); + SearchTestUtils.searchOneLocale("\u8FB2\u66C6\u65B0\u5E74", + new String[] { "/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm", + "/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html" }, + "en"); } @Test public void testUtf8Hebrew() { SearchTestUtils.searchOneLocale("\u05D0\u05B7\u05E1\u05B0\u05D8\u05B0\u05E8\u05D5\u05B9\u05E0\u05D5\u05B9\u05DE" - + "\u05B0\u05D9\u05B8\u05D4) \u05DC\u05B4\u05E7\u05BC\u05D5\u05BC\u05D9 (\u05D9\u05E8\u05D7 \u05D0\u05D5 \u05E9\u05DE\u05E9", new String[] {"/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm" }, "en"); + + "\u05B0\u05D9\u05B8\u05D4) \u05DC\u05B4\u05E7\u05BC\u05D5\u05BC\u05D9 (\u05D9\u05E8\u05D7 \u05D0\u05D5 \u05E9\u05DE\u05E9", + new String[] { "/org.eclipse.ua.tests/data/help/search/testnlUTF8.htm", + 
"/org.eclipse.ua.tests/data/help/search/testnlUTF8_html5.html" }, + "en"); } - } |