Skip to main content
aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLeo Ufimtsev2018-06-13 17:54:20 -0400
committerLeo Ufimtsev2018-06-14 12:51:43 -0400
commit1823ab237d69270276b2e681b68b46bf881f6abf (patch)
tree336fbe79972786c379789cd1dee6aac1c7909021
parent1df1be7c4bb508960a7308b328f53bdbc9cb769f (diff)
downloadeclipse.platform.swt-1823ab237d69270276b2e681b68b46bf881f6abf.tar.gz
eclipse.platform.swt-1823ab237d69270276b2e681b68b46bf881f6abf.tar.xz
eclipse.platform.swt-1823ab237d69270276b2e681b68b46bf881f6abf.zip
Bug 535392 [Webkit2] Browser.getText() returns wrong decoding when
setText() contains utf (code point >127) characters Problem: - Webkit sometimes returns ASCII encoding and sometimes UTF-16LE - ASCII is returned when all characters are ascii, UTF-16LE is returned if at least one character is non-ascii. - At the binary level, it is not possible to reliably tell the encodings apart - webkit2 currently (v2.20) doesn't have api to get encoding of data. - Byte order mark is not provided with return value. Solution: - I wrote an in-house-made SWT heuristic to try and figure out encoding. This works well for strings that are 2+ characters and OK for most single characters. - Hopefully this is a temporary solution until Webkit2gtk developers give us access to encoding api. (I will request in 535392). Testing: - Added testing suite for converter. - Tested via snippet. - Tested via given SWT bot project that sets/gets text of 'find' man page. - All SWT jUnit tests work fine. - Child eclipse seems to work fine. Considerations & potential issues with this patch: - This patch contains a lot of double/triple byte UTF characters, if some of our build tools have issues with special UTF characters, then they may have issues. Bug: https://bugs.eclipse.org/bugs/show_bug.cgi?id=535392 Change-Id: I9be1f679676dfacf6415d0ae702cea41aa30cf7b Signed-off-by: Leo Ufimtsev <lufimtse@redhat.com>
-rw-r--r--bundles/org.eclipse.swt/Eclipse SWT WebKit/gtk/org/eclipse/swt/browser/WebKit.java11
-rw-r--r--bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/internal/Converter.java217
-rw-r--r--tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/AllGTKTests.java4
-rw-r--r--tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/Test_GtkConverter.java160
-rw-r--r--tests/org.eclipse.swt.tests.gtk/ManualTests/org/eclipse/swt/tests/gtk/snippets/Bug535392_getText.java123
5 files changed, 497 insertions, 18 deletions
diff --git a/bundles/org.eclipse.swt/Eclipse SWT WebKit/gtk/org/eclipse/swt/browser/WebKit.java b/bundles/org.eclipse.swt/Eclipse SWT WebKit/gtk/org/eclipse/swt/browser/WebKit.java
index f987f90f65..8ee66d8a0b 100644
--- a/bundles/org.eclipse.swt/Eclipse SWT WebKit/gtk/org/eclipse/swt/browser/WebKit.java
+++ b/bundles/org.eclipse.swt/Eclipse SWT WebKit/gtk/org/eclipse/swt/browser/WebKit.java
@@ -1758,17 +1758,16 @@ private static class Webkit2AsyncToSync {
OS.g_error_free(gerrorRes[0]);
retObj.returnValue = (String) "";
} else {
- long /*int*/ GString;
- GString = OS.g_string_new_len(guchar_data, gsize_len[0]); //(Str + len) -> (null terminated str)
- String text = Converter.cCharPtrToJavaString(OS.GString_str(GString), false);
- OS.g_string_free(GString, 1);
- retObj.returnValue = (String) text;
+ int len = (int) gsize_len[0];
+ byte[] buffer = new byte [len];
+ C.memmove (buffer, guchar_data, len);
+ String text = Converter.byteToStringViaHeuristic(buffer);
+ retObj.returnValue = text;
}
retObj.callbackFinished = true;
Display.getCurrent().wake();
}
-
/**
* You should check 'retObj.swtAsyncTimeout' after making a call to this.
*/
diff --git a/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/internal/Converter.java b/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/internal/Converter.java
index d0ca0677c6..1e5c67ef7e 100644
--- a/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/internal/Converter.java
+++ b/bundles/org.eclipse.swt/Eclipse SWT/gtk/org/eclipse/swt/internal/Converter.java
@@ -11,33 +11,105 @@
package org.eclipse.swt.internal;
+import java.io.*;
+import java.nio.*;
+import java.nio.charset.*;
+
import org.eclipse.swt.internal.gtk.*;
/**
+ * About this class:
+ * #################
* This class implements the conversions between unicode characters
- * and the <em>platform supported</em> representation for characters.
- * <p>
+ * and the platform supported representation for characters.
+ *
* Note that, unicode characters which can not be found in the platform
* encoding will be converted to an arbitrary platform specific character.
*
- * Note:
- * Regular JNI String conversion usually uses a modified UTF-8, see:
- * https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
- * And in JNI, normally (env*)->GetStringUTFChars(..) is used to convert a javaString into a C string. See:
- * http://docs.oracle.com/javase/8/docs/technotes/guides/jni/spec/functions.html#GetStringUTFChars
+ * This class is tested via: org.eclipse.swt.tests.gtk.Test_GtkConverter
+ *
+ * About JNI & string conversion:
+ * #############################
+ * - Regular JNI String conversion usually uses a modified UTF-8, see: https://en.wikipedia.org/wiki/UTF-8#Modified_UTF-8
+ * - And in JNI, normally (env*)->GetStringUTFChars(..) is used to convert a javaString into a C string.
+ * See: http://docs.oracle.com/javase/8/docs/technotes/guides/jni/spec/functions.html#GetStringUTFChars
+ *
* However, the modified UTF-8 only works well with C system functions as it doesn't contain embedded nulls
* and is null terminated.
+ *
* But because the modified UTF-8 only supports up to 3 bytes (and not up to 4 as regular UTF-8), characters
* that require 4 bytes (e.g emojos) are not translated properly from Java to C.
- * To work around this issue, we convert the Java string to a byte array on the Java side manually and then
- * pass it to C. See:
- * http://stackoverflow.com/questions/32205446/getting-true-utf-8-characters-in-java-jni
+ *
+ * To work around this issue, we convert the Java string to a byte array on the Java side manually and then pass it to C.
+ * See: http://stackoverflow.com/questions/32205446/getting-true-utf-8-characters-in-java-jni
*
* Note:
* Java uses UTF-16 Wide characters internally to represent a string.
* C uses UTF-8 Multibyte characters (null terminated) to represent a string.
*
- * </p>
+ * About encoding on Linux/Gtk & its relevance to SWT:
+ * ####################################################
+ *
+ * UTF-8 and UTF-16 are variable length encodings (UTF-32 is fixed length).
+ *
+ * UTF-8 = minimum is 8 bits, max is 4 bytes (the original design allowed up to 6, but RFC 3629 caps it at 4). Gtk & most of web uses this.
+ * UTF-16 = minimum is 16 bits, max is 32 bits (a surrogate pair). Java's strings are stored this way.
+ * UTF-16 can be
+ * Big Endian : 65 = 00000000 01000001 # Human friendly, reads left to right.
+ * Little Endian : 65 = 01000001 00000000 # Intel x86 and also AMD64 / x86-64 series of processors use the little-endian [1]
+ * # i.e, we in SWT often have to deal with UTF-16 LE
+ * Some terminology:
+ * - "Code point" is the numerical value of unicode character.
+ * - All of UTF-* have the same letter to code-point mapping,
+ * but UTF-8/16/32 have different "back-ends".
+ *
+ * Illustration:
+ * (char) = (code point) = (back end).
+ * A = 65 = 01000001 UTF-8
+ * = 00000000 01000001 UTF-16 BE
+ * = 01000001 00000000 UTF-16 LE
+ *
+ * - Byte Order Marks (BOM) are a few bytes at the start of a *file* indicating which endianness is used.
+ * Problem: Gtk/webkit often don't give us BOMs.
+ * (further reading: [2], [3])
+ *
+ * - We can reliably encode a character to a backend (A -> UTF-8/16), but the other way round is
+ * guess work since byte order marks are often missing and UTF-16 byte sequences can also happen to be valid UTF-8.
+ * (see Converter.byteToStringViaHeuristic for details).
+ * We could improve our heuristic by using something like http://jchardet.sourceforge.net/.
+ *
+ * - Glib has some conversion functions:
+ * g_utf16_to_utf8
+ * g_utf8_to_utf16
+ *
+ * - So does java: (e.g null terminated UTF-8)
+ * ("myString" + '\0').getBytes(StandardCharsets.UTF_8)
+ *
+ * - I suggest using Java functions where possible to avoid memory leaks.
+ * (Yes, they happen and are big-pain-in-the-ass to find https://bugs.eclipse.org/bugs/show_bug.cgi?id=533995)
+ *
+ *
+ * Learning about encoding:
+ * #########################
+ * I suggest the following 3 videos to understand ASCII/UTF-8/UTF-16[LE|BE]/UTF-32 encoding:
+ * Overview: https://www.youtube.com/watch?v=MijmeoH9LT4
+ * Details:
+ * Part-1: https://www.youtube.com/watch?v=B1Sf1IhA0j4
+ * Part-2: https://www.youtube.com/watch?v=-oYfv794R9s
+ * Part-3: https://www.youtube.com/watch?v=vLBtrd9Ar28
+ *
+ * Also read all of this:
+ * http://kunststube.net/encoding/
+ * and this:
+ * https://www.joelonsoftware.com/2003/10/08/the-absolute-minimum-every-software-developer-absolutely-positively-must-know-about-unicode-and-character-sets-no-excuses/
+ *
+ * And lastly, good utf-8 reference: https://en.wikipedia.org/wiki/UTF-8#Description
+ *
+ * You should now be a master of encoding. I wish you luck on your journey.
+ *
+ * [1] https://en.wikipedia.org/wiki/Endianness
+ * [2] https://en.wikipedia.org/wiki/Byte_order_mark
+ * [3] BOM's: http://unicode.org/faq/utf_bom.html#BOM
*/
public final class Converter {
public static final byte [] NullByteArray = new byte [1];
@@ -182,4 +254,127 @@ public static char mbcsToWcs (char ch) {
return result [0];
}
+/**
+ * Given a byte array with unknown encoding, try to decode it via a (relatively simple) heuristic.
+ * This is useful when we're not provided the encoding by OS/library.<br>
+ *
+ * Current implementation only supports standard java charsets but can be extended as needed.
+ * This method could be improved by using http://jchardet.sourceforge.net/ <br>
+ *
+ * Run time is linear in the input size: a small, constant number of passes (UTF-8 decode, null byte scan, code point scan and re-decoding with at most four other charsets).
+ *
+ * @param bytes raw bytes of unknown encoding, e.g. as received from the OS or a native library.
+ * @return the decoded string (empty for empty input or a single null byte); if no supported charset can decode the input, an error message containing a stack trace.
+ */
+public static String byteToStringViaHeuristic(byte [] bytes) {
+ /*
+ * Technical notes:
+ * - Given a sequence of bytes, UTF-8 and UTF-16 cannot be told apart deterministically [1].
+ * - However, UTF-16 has a lot of null bytes when code points are mostly in the 0-255 range (only one byte of each pair is used),
+ * so a byte sequence with many null bytes is likely UTF-16.
+ * - Valid UTF-8 technically can contain null bytes, but it's rare.
+ *
+ * Sometimes the heuristic gets confused when it receives two non-null bytes, e.g. Ё = (UTF-16LE [01,04]).
+ * They can mean either a valid sequence of UTF-8 characters or a single UTF-16 character.
+ * This issue typically only occurs for very short sequences (1-5 characters) of very special characters.
+ * Improving the heuristic for such corner cases is complicated. We'd have to implement a mechanism
+ * that would be aware of character frequencies and assign a score to the probability of each mapping.
+ *
+ * [1] https://softwareengineering.stackexchange.com/questions/187169/how-to-detect-the-encoding-of-a-file
+ */
+ // Base cases: empty input, or a lone null byte (e.g. just a terminator), decode to the empty string.
+ if ((bytes.length == 0) ||
+ (bytes.length == 1 && bytes[0] == 0)) {
+ return "";
+ }
+
+ // Test if it's valid UTF-8.
+ // Note, ASCII is a subset of UTF-8.
+ try {
+ CharsetDecoder charDecoder = StandardCharsets.UTF_8.newDecoder();
+ charDecoder.onMalformedInput(CodingErrorAction.REPORT);
+ charDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+ String text = charDecoder.decode(ByteBuffer.wrap(bytes)).toString();
+
+ // No exception thrown means that we have a valid UTF-8 "bit string". However, a valid UTF-8 bit string doesn't mean it's the correct decoding.
+ // We have to assess correctness via an educated guess.
+ boolean probablyUTF8 = true;
+
+ {
+ // Problem 1: It might be UTF-16 since at the binary level UTF-16 can be valid UTF-8. (null is a valid utf-8 character).
+ // Solution: Count nulls to try to guess if it's UTF-16.
+ // Verified via
+ // org.eclipse.swt.tests.gtk.Test_GtkConverter.test_HeuristicUTF16_letters()
+ // org.eclipse.swt.tests.gtk.Test_GtkConverter.test_HeuristicUTF16_letter()
+ double nullBytePercentageForUtf16 = 0.01; // a fraction, not a percentage: if more than 1% of the bytes are null, then it's probably utf-16.
+ int nullCount = 0;
+ for (byte b : bytes) {
+ if (b == 0)
+ nullCount++;
+ }
+ double nullPercentage = (double) nullCount / (double) bytes.length;
+ if (nullPercentage > nullBytePercentageForUtf16) {
+ probablyUTF8 = false;
+ }
+ }
+
+ // Problem 2: Valid UTF-8 bit string can map to invalid code points (i.e undefined unicode)
+ // Solution 2: verify that every character is a valid code point. NOTE(review): Character.codePointAt appears to always return a value that isValidCodePoint accepts, so this check looks like a no-op; confirm.
+ if (probablyUTF8) {
+ char [] chars = text.toCharArray();
+
+ for (int i = 0; i < chars.length; i++) {
+ int codePoint = Character.codePointAt(chars, i);
+ if (!Character.isValidCodePoint(codePoint)) {
+ probablyUTF8 = false;
+ break;
+ }
+ }
+ }
+
+ // Problem 3: Short 2-byte sequences are very ambiguous.
+ // E.g Unicode Hyphen U+2010 '‐' (which btw is different from the ascii U+002D '-' Hyphen-Minus)
+ // can be misunderstood as the two bytes 0x10 (Data Link Escape) & 0x20 (Space).
+ // Solution: Unless we have two valid alphabet characters, it's probably a single utf-16 character.
+ // However, this leads to the problem that single non-alphabetic unicode characters are not recognized correctly.
+ // Below code is left in case recognizing alphabetic characters is of higher priority than exotic unicode ones.
+// if (probablyUTF8) {
+// if (bytes.length == 2) {
+// char [] chars = text.toCharArray();
+// for (int i = 0; i < chars.length; i++) {
+// int codePoint = Character.codePointAt(chars, i);
+// if (!Character.isAlphabetic(codePoint)) {
+// probablyUTF8 = false;
+// break;
+// }
+// }
+// }
+// }
+
+ if (!probablyUTF8) {
+ return new String (bytes, StandardCharsets.UTF_16LE);
+ } else {
+ return text;
+ }
+ } catch (CharacterCodingException e) { // Not valid UTF-8: deliberately ignored, fall through and try the other charsets below.
+ }
+
+ // Invalid UTF-8. Try other character sets, in order. NOTE(review): ISO_8859_1 presumably accepts any byte sequence, so the charsets after it and the error string below look unreachable; confirm.
+ Charset [] commonWebCharSets = new Charset[] {StandardCharsets.UTF_16LE, StandardCharsets.ISO_8859_1, StandardCharsets.UTF_16BE, StandardCharsets.UTF_16};
+ for (Charset setToTry : commonWebCharSets) {
+ try {
+ CharsetDecoder charDecoder = setToTry.newDecoder();
+ charDecoder.onMalformedInput(CodingErrorAction.REPORT);
+ charDecoder.onUnmappableCharacter(CodingErrorAction.REPORT);
+ return charDecoder.decode(ByteBuffer.wrap(bytes)).toString();
+ } catch (CharacterCodingException e) {} // This charset cannot decode the bytes: try the next one.
+ }
+
+ // Could not determine encoding.
+ // Return error string with stack trace to help users determine which function led to a failed decoding.
+ StringWriter sw = new StringWriter();
+ new Throwable("").printStackTrace(new PrintWriter(sw));
+ return "SWT: Failed to decode byte buffer. Encoding is not ASCII/UTF-8/UTF-16[LE|BE|BOM]/ISO_8859_1. Stack trace:\n" + sw.toString();
+}
+
}
diff --git a/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/AllGTKTests.java b/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/AllGTKTests.java
index 73fb65a34f..6be5e22579 100644
--- a/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/AllGTKTests.java
+++ b/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/AllGTKTests.java
@@ -14,6 +14,7 @@
*/
package org.eclipse.swt.tests.gtk;
+import org.junit.runner.JUnitCore;
import org.junit.runner.RunWith;
import org.junit.runners.Suite;
@@ -21,12 +22,13 @@ import org.junit.runners.Suite;
@RunWith(Suite.class)
@Suite.SuiteClasses({
// Test.class be added here.
+ Test_GtkConverter.class
})
public class AllGTKTests {
public static void main(String[] args) {
-// JUnitCore.main(AllGTKTests.class.getName()); // Enable once a test is added.
+ JUnitCore.main(AllGTKTests.class.getName()); // Runs this suite when the class is launched as a plain Java application.
}
}
diff --git a/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/Test_GtkConverter.java b/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/Test_GtkConverter.java
new file mode 100644
index 0000000000..8a348d51ef
--- /dev/null
+++ b/tests/org.eclipse.swt.tests.gtk/JUnit Tests/org/eclipse/swt/tests/gtk/Test_GtkConverter.java
@@ -0,0 +1,160 @@
+/*******************************************************************************
+ * Copyright (c) 2018 Red Hat and others. All rights reserved.
+ * The contents of this file are made available under the terms
+ * of the GNU Lesser General Public License (LGPL) Version 2.1 that
+ * accompanies this distribution (lgpl-v21.txt). The LGPL is also
+ * available at http://www.gnu.org/licenses/lgpl.html. If the version
+ * of the LGPL at http://www.gnu.org is different to the version of
+ * the LGPL accompanying this distribution and there is any conflict
+ * between the two license versions, the terms of the LGPL accompanying
+ * this distribution shall govern.
+ *
+ * Contributors:
+ * Red Hat - initial API and implementation
+ */
+package org.eclipse.swt.tests.gtk;
+
+import static org.junit.Assert.fail;
+
+import java.nio.charset.StandardCharsets;
+
+import org.eclipse.swt.internal.Converter;
+import org.junit.Ignore;
+import org.junit.Test;
+
+/**
+ * Tests for the encoding heuristic Converter.byteToStringViaHeuristic(byte[]).
+ * Good source of Unicode code points for testing: https://en.wikipedia.org/wiki/List_of_Unicode_characters
+ *
+ * We care about Ascii, UTF-8 (as it's used a lot by glib/gtk) and UTF-16LE (as it's used by java/intel/amd architecture).
+ */
+public class Test_GtkConverter {
+
+ static final String emptyStr = "";
+
+ static final String asciiLetterA = "A"; // = 65 // Note, UTF-8 is backwards compatible with Ascii
+ static final String dollarSign = "$"; // =36
+
+ static final String asciiLetters = "ABCabc"; // 65(A), 66, 67 97(a), 98, 99
+
+ // Anything above 127 translates to 2 or more bytes in utf-8. See: https://en.wikipedia.org/wiki/UTF-8#Description
+ static final String codePoint174 = "®"; // U+00AE Registered sign.
+ static final String unicodeCharactersLowCodePoints = "®ÖöėŊ‐"; // bigger than 127, but not many bytes.
+ static final String unicodeCharactersHighCodePoints = "▇░▙▚▧▫♂☢⛔"; // code points in the U+2500..U+26FF range.
+
+ @Test
+ public void test_HeuristicASCII_letterA() {
+ helper_testHeuristic(asciiLetterA.getBytes(StandardCharsets.US_ASCII), asciiLetterA); // A = 65
+ }
+ @Test
+ public void test_HeuristicASCII_dollarSign() {
+ helper_testHeuristic(dollarSign.getBytes(StandardCharsets.US_ASCII), dollarSign); // $ = 36
+ }
+
+ @Test
+ public void test_Heuristic_null() {
+ helper_testHeuristic(new byte[] {0}, emptyStr); // simulate a lone null terminator.
+ }
+
+ @Test
+ public void test_HeuristicASCII_emptyString() {
+ helper_testHeuristic(emptyStr.getBytes(StandardCharsets.US_ASCII), emptyStr); // "" -> [] (empty byte array)
+ }
+
+ @Test
+ public void test_HeuristicUTF8_null() {
+ helper_testHeuristic(emptyStr.getBytes(StandardCharsets.UTF_8), emptyStr);
+ }
+
+ @Test
+ public void test_HeuristicUTF16LE_null() {
+ helper_testHeuristic(emptyStr.getBytes(StandardCharsets.UTF_16LE), emptyStr);
+ }
+
+ @Test
+ public void test_HeuristicASCII_letters() {
+ helper_testHeuristic(asciiLetters.getBytes(StandardCharsets.US_ASCII), asciiLetters);
+ }
+
+ @Test
+ public void test_HeuristicUTF8_letterUnder127() {
+ helper_testHeuristic(asciiLetterA.getBytes(StandardCharsets.UTF_8), asciiLetterA);
+ }
+
+ @Test
+ public void test_HeuristicUTF8_letterOver127() {
+ helper_testHeuristic(codePoint174.getBytes(StandardCharsets.UTF_8), codePoint174);
+ }
+
+
+ @Test
+ public void test_HeuristicUTF8_letterSpecial() {
+ helper_testHeuristic("Ё".getBytes(StandardCharsets.UTF_8), "Ё");
+ }
+
+ @Test
+ public void test_HeuristicUTF8_LowCodePoints() {
+ helper_testHeuristic(unicodeCharactersLowCodePoints.getBytes(StandardCharsets.UTF_8), unicodeCharactersLowCodePoints);
+ }
+
+ @Test
+ public void test_HeuristicUTF8_HighCodePoints() {
+ byte [] testBytes = unicodeCharactersHighCodePoints.getBytes(StandardCharsets.UTF_8);
+ helper_testHeuristic(testBytes, unicodeCharactersHighCodePoints);
+ }
+
+
+ @Test
+ public void test_HeuristicUTF16_Asciiletter() {
+ helper_testHeuristic(asciiLetterA.getBytes(StandardCharsets.UTF_16LE), asciiLetterA);
+ }
+
+ @Test
+ public void test_HeuristicUTF16_AsciiLetters() {
+ helper_testHeuristic(asciiLetters.getBytes(StandardCharsets.UTF_16LE), asciiLetters);
+ }
+
+ @Test
+ public void test_HeuristicUTF16_letter() {
+ String testValue = "®"; // U+00AE, code point 174
+ byte [] testBytes = testValue.getBytes(StandardCharsets.UTF_16LE);
+ helper_testHeuristic(testBytes, testValue);
+ }
+
+ @Test
+ public void test_HeuristicUTF16_letters() {
+ helper_testHeuristic(unicodeCharactersLowCodePoints.getBytes(StandardCharsets.UTF_16LE), unicodeCharactersLowCodePoints);
+ }
+
+ @Test
+ public void test_HeuristicUTF16_LotsOfLetters() {
+ byte [] testBytes = unicodeCharactersHighCodePoints.getBytes(StandardCharsets.UTF_16LE);
+ helper_testHeuristic(testBytes, unicodeCharactersHighCodePoints);
+ }
+
+ /**
+ * There are a few unicode characters that are ambiguous if they are decoded on their own,
+ * as they can translate to either two valid UTF-8 characters or a single valid UTF-16LE character.
+ *
+ * e.g 'Ё'. (but there are others).
+ *
+ * The heuristic works better if there are 2+ characters, e.g HЁLLO WORLD.
+ *
+ * This test documents the limitation; it is currently known to fail and is therefore ignored.
+ *
+ */
+ @Ignore
+ @Test
+ public void test_Heuristic_specialSingleCases() {
+ byte [] testBytes = "Ё".getBytes(StandardCharsets.UTF_16LE);
+ helper_testHeuristic(testBytes, "Ё");
+ }
+
+ private void helper_testHeuristic(byte[] testBytes, String expected) {
+ String result = Converter.byteToStringViaHeuristic(testBytes);
+ if (!expected.equals(result)) {
+ fail();
+ }
+ }
+
+}
diff --git a/tests/org.eclipse.swt.tests.gtk/ManualTests/org/eclipse/swt/tests/gtk/snippets/Bug535392_getText.java b/tests/org.eclipse.swt.tests.gtk/ManualTests/org/eclipse/swt/tests/gtk/snippets/Bug535392_getText.java
new file mode 100644
index 0000000000..3e2d50c5a9
--- /dev/null
+++ b/tests/org.eclipse.swt.tests.gtk/ManualTests/org/eclipse/swt/tests/gtk/snippets/Bug535392_getText.java
@@ -0,0 +1,123 @@
+/*******************************************************************************
+ * Copyright (c) 2018 Red Hat and others. All rights reserved.
+ * The contents of this file are made available under the terms
+ * of the GNU Lesser General Public License (LGPL) Version 2.1 that
+ * accompanies this distribution (lgpl-v21.txt). The LGPL is also
+ * available at http://www.gnu.org/licenses/lgpl.html. If the version
+ * of the LGPL at http://www.gnu.org is different to the version of
+ * the LGPL accompanying this distribution and there is any conflict
+ * between the two license versions, the terms of the LGPL accompanying
+ * this distribution shall govern.
+ *
+ * Contributors:
+ * Red Hat - initial API and implementation
+ */
+package org.eclipse.swt.tests.gtk.snippets;
+
+import java.util.concurrent.atomic.AtomicBoolean;
+
+import org.eclipse.swt.SWT;
+import org.eclipse.swt.browser.Browser;
+import org.eclipse.swt.browser.ProgressAdapter;
+import org.eclipse.swt.browser.ProgressEvent;
+import org.eclipse.swt.layout.GridData;
+import org.eclipse.swt.layout.GridLayout;
+import org.eclipse.swt.widgets.Display;
+import org.eclipse.swt.widgets.Shell;
+
+
+public class Bug535392_getText {
+ static int run = 1;
+ public static void main(String[] args) {
+ Display display = new Display();
+ Shell shell = new Shell(display);
+ shell.setSize(400, 400);
+ shell.setLayout(new GridLayout(2, false));
+ Browser browser = new Browser(shell, SWT.BORDER);
+ browser.setLayoutData(new GridData(SWT.FILL, SWT.FILL, true, true, 2, 1));
+ shell.open();
+
+ // Most of below work, but a few single unicode characters are misunderstood. Heuristic works better with many characters.
+ // https://en.wikipedia.org/wiki/List_of_Unicode_characters
+ testValue(display, shell, browser, "-", true); // working, regular ascii '-'
+ testValue(display, shell, browser, "‐", true); // BROKEN, (single char read as UTF-8 instead of UTF-16).
+ testValue(display, shell, browser, "ABC", true); // 65 66 67
+ testValue(display, shell, browser, "A®A", true); // U+00AE ® 0174 &reg; Registered sign 0110
+ testValue(display, shell, browser, "A¢A", true); // U+00A2 ¢ 0162 &cent; Cent sign
+ testValue(display, shell, browser, "ABCüDü", true); // U+00FC ü 0252 &uuml; Latin Small Letter U with diaeresis 0188
+ testValue(display, shell, browser, "AӛB", true); // U+04DB ӛ Cyrillic Small Letter Schwa with diaeresis 0620
+ testValue(display, shell, browser, "Ё", true); // BROKEN. (single char read as UTF-8 instead of UTF-16). U+0401 Ё Cyrillic Capital Letter Io
+
+ if (run == 0) { // Exhaustive sweep over char values; disabled by default because 'run' is initialised to 1.
+ for (int i = 0; i < 100000; i++) {
+ final String testStr = new String(new char [] {'A', (char) i});
+ testValue(display, shell, browser, testStr, i, true);
+
+ }
+ }
+
+ testValue(display, shell, browser, "SYNOPSIS\n" +
+ " find [-H] [-L] [-P] [-D debugopts] [-Olevel] [starting-point...] [expression]\n" +
+ "\n" +
+ "DESCRIPTION\n" +
+ " This manual page documents the GNU version of find. GNU find searches the directory tree rooted at each given starting-point by evaluating the given expression from left to right, according to the rules of precedence (see sec‐\n" +
+ " tion OPERATORS), until the outcome is known (the left hand side is false for and operations, true for or), at which point find moves on to the next file name. If no starting-point is specified, `.' is assumed.\n" +
+ "\n" +
+ " If you are using find in an environment where security is important (for example if you are using it to search directories that are writable by other users), you should read the \"Security Considerations\" chapter of the findu‐\n" +
+ " tils documentation, which is called Finding Files and comes with findutils. That document also includes a lot more detail and discussion than this manual page, so you may find it a more useful source of information.\n" +
+ "\n" +
+ "OPTIONS\n" +
+ " The -H, -L and -P options control the treatment of symbolic links. Command-line arguments following these are taken to be names of files or directories to be examined, up to the first argument that begins with `-', or the\n" +
+ " argument `(' or `!'. That argument and any following arguments are taken to be the expression describing what is to be searched for. If no paths are given, the current directory is used. If no expression is given, the\n" +
+ " expression -print is used (but you should probably consider using -print0 instead, anyway).\n" +
+ "\n" +
+ " This manual page talks about `options' within the expression list. These options control the behaviour of find but are specified immediately after the last path name. The five `real' options -H, -L, -P, -D and -O must appear\n" +
+ " before the first path name, if at all. A double dash -- can also be used to signal that any remaining arguments are not options (though ensuring that all start points begin with either `./' or `/' is generally safer if you use\n" +
+ " wildcards in the list of start points).", true);
+
+
+
+
+ display.dispose();
+ }
+
+
+ private static void testValue(Display display, Shell shell, Browser browser, String testStr, boolean autoTest) {
+ testValue(display, shell, browser, testStr, 0, autoTest);
+ }
+
+ // Sets testStr on the browser, reads it back once loading completes and prints PASS/FAIL. Original author's note: "I think this is broken for values above 127 :-/."
+ private static void testValue(Display display, Shell shell, Browser browser, String testStr, int testID, boolean autoTest) {
+ AtomicBoolean testFinished = new AtomicBoolean(false);
+ browser.setText(testStr);
+
+ ProgressAdapter completionTester = new ProgressAdapter() {
+ @Override
+ public void completed(ProgressEvent event) {
+ Browser browser = (Browser) event.widget;
+ String returnedStr = browser.getText();
+ if (testStr.equals(returnedStr)) {
+ System.out.println("(PASS): testStr/returnedStr: " + testStr + "/" + returnedStr + " Test id:" + testID);
+ } else {
+ System.err.println("(FAIL): testStr/returnedStr: " + testStr + "/" + returnedStr + " Test id:" + testID);
+ }
+ testFinished.set(true);
+ }
+ };
+
+ browser.addProgressListener(completionTester);
+
+ if (autoTest) {
+ while (!shell.isDisposed() && !testFinished.get()) {
+ display.readAndDispatch(); // busy-polls (no display.sleep()) until the completed() callback has fired.
+ }
+ browser.removeProgressListener(completionTester);
+ } else {
+ while (!shell.isDisposed()) {
+ if (!display.readAndDispatch()) {
+ display.sleep();
+ }
+ }
+ }
+ }
+}

Back to the top