Skip to main content
aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMathias Kunter2012-02-02 02:49:48 +0000
committerMarc Khouzam2012-03-06 14:23:21 +0000
commita974434ec3a84966ec2c18463276be7ff8086bff (patch)
tree86cd798b55f2e29338418e51a3473fc64d630efc /dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java
parentf360d64c77c71187041a08cc97bf132b7ede88c9 (diff)
downloadorg.eclipse.cdt-a974434ec3a84966ec2c18463276be7ff8086bff.tar.gz
org.eclipse.cdt-a974434ec3a84966ec2c18463276be7ff8086bff.tar.xz
org.eclipse.cdt-a974434ec3a84966ec2c18463276be7ff8086bff.zip
Bug 307311: Incorrect string display when using non-ASCII characters
Diffstat (limited to 'dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java')
-rw-r--r--dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java487
1 files changed, 487 insertions, 0 deletions
diff --git a/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java b/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java
new file mode 100644
index 00000000000..a5a4a770583
--- /dev/null
+++ b/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java
@@ -0,0 +1,487 @@
+/*******************************************************************************
+ * Copyright (c) 2012 Mathias Kunter and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/legal/epl-v10.html
+ *
+ * Contributors:
+ * Mathias Kunter - Initial Implementation (Bug 307311)
+ *******************************************************************************/
+package org.eclipse.cdt.dsf.mi.service.command.output;
+
+import java.nio.ByteBuffer;
+import java.nio.CharBuffer;
+import java.nio.charset.Charset;
+import java.nio.charset.CharsetDecoder;
+import java.nio.charset.CharsetEncoder;
+import java.text.ParseException;
+import java.util.EnumSet;
+
+/**
+ * The MIStringHandler class provides several static functions to handle C and / or MI strings.
+ * @since 4.1
+ */
+public class MIStringHandler {
+
+ /**
+ * Defines special characters which are used within escape notations to represent a
+ * corresponding Unicode code point (i.e. character code). See the specialCodePoints
+ * list below on the same index for the corresponding code point.
+ */
+ private static char[] specialChars = new char[] {
+ 'a', // Alert (bell) character
+ 'b', // Backspace character
+ 'e', // GNU extension: Escape character
+ 'E', // same as 'e'
+ 'f', // Form feed character
+ 'n', // New line character
+ 'r', // Carriage return character
+ 't', // Horizontal tabulation character
+ 'v', // Vertical tabulation character
+ '\'', // Single quotation mark
+ '"', // Double quotation mark
+ '\\', // Backslash
+ '?' // Literal question mark
+ };
+
+ /**
+ * Defines special Unicode code points (i.e. character codes) which can be escaped by a
+ * corresponding special character. See the specialChars list above on the same index
+ * for the corresponding special character.
+ */
+ private static int[] specialCodePoints = new int[] {
+ 0x07, // corresponds to \a
+ 0x08, // corresponds to \b
+ 0x1B, // corresponds to \e
+ 0x1B, // corresponds to \E
+ 0x0C, // corresponds to \f
+ 0x0A, // corresponds to \n
+ 0x0D, // corresponds to \r
+ 0x09, // corresponds to \t
+ 0x0B, // corresponds to \v
+ 0x27, // corresponds to \'
+ 0x22, // corresponds to \"
+ 0x5C, // corresponds to \\
+ 0x3F // corresponds to \?
+ };
+
+ /**
+ * An internal helper enumeration which holds the current status while parsing an escaped
+ * text sequence.
+ */
+ private enum EscapeStatus {
+ NONE,
+ BEGIN,
+ OCTAL_NUMBER,
+ HEX_NUMBER,
+ UNICODE_SHORT_NUMBER,
+ UNICODE_LONG_NUMBER,
+ VALID,
+ INVALID
+ }
+
+ /**
+ * An enumeration defining the escape sequences which should be parsed.
+ */
+ public enum ParseFlags {
+ SPECIAL_CHARS,
+ OCTAL_NUMBERS,
+ HEX_NUMBERS,
+ UNICODE_SHORT_NUMBERS,
+ UNICODE_LONG_NUMBERS
+ }
+
+ /**
+ * Translates the given C string into a string suitable for display. This includes handling
+ * of escaped characters and different string encodings. This is necessary in order to correctly
+ * deal with non-ASCII strings.
+ * @param str The C string to translate.
+ * @param escapeChars Defines whether non-printable characters should be escaped within
+ * the translated string, or not.
+ * @return The translated string.
+ */
+ public static String translateCString(String str, boolean escapeChars) {
+ if (escapeChars) {
+ // Don't parse the special character escape notations here. We can do this here because
+ // we want to keep them in their escaped form anyway, and because the following string
+ // transcoding process isn't affected by escaped special chars. By doing so we avoid
+ // caring about some nasty details of the special character escaping process: for
+ // example, single quotation marks are commonly only escaped within character constants,
+ // while double quotation marks are commonly only escaped within string constants. By
+ // not parsing the special character escape notations at all here, we just keep the
+ // original special character escaping provided by the given MI string.
+ str = parseString(str, EnumSet.complementOf(EnumSet.of(ParseFlags.SPECIAL_CHARS)));
+ } else {
+ // Parse all escaped characters.
+ str = parseString(str);
+ }
+
+ // Transcode the string in order to handle non-ASCII strings correctly.
+ str = transcodeString(str);
+
+ if (escapeChars) {
+ // Escape any non-printable characters again, as we want to be able to display them.
+ // However, don't escape any printable special chars, as they haven't been parsed before.
+ str = escapeString(str, false);
+ } else {
+ // No escaping necessary here. We however have to make sure that we use the correct line
+ // separation character sequence.
+ str = str.replace("\n", System.getProperty("line.separator", "\n")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+ }
+
+ return str;
+ }
+
+ /**
+ * Returns whether the given character is a special character, or not.
+ * @param c The character to test.
+ * @return The test result.
+ */
+ public static boolean isSpecialChar(char c) {
+ for (int i = 0; i < specialChars.length; i++) {
+ if (specialChars[i] == c) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Returns whether the given Unicode code point is a special code point, or not.
+ * @param codePoint The Unicode code point to test.
+ * @return The test result.
+ */
+ public static boolean isSpecialCodePoint(int codePoint) {
+ for (int i = 0; i < specialCodePoints.length; i++) {
+ if (specialCodePoints[i] == codePoint) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /**
+ * Parses the given special character into an Unicode code point.
+ * @param c The special character to parse.
+ * @return The parsed Unicode code point.
+ * @throws ParseException Thrown when the given character can't be parsed. This happens when it's
+ * not a special character.
+ */
+ public static int parseSpecialChar(char c) throws ParseException {
+ for (int i = 0; i < specialChars.length; i++) {
+ if (specialChars[i] == c) {
+ return specialCodePoints[i];
+ }
+ }
+ throw new ParseException("The given character '" + c + "' is not a special character.", 0); //$NON-NLS-1$ //$NON-NLS-2$
+ }
+
+ /**
+ * Parses the given special Unicode code point into a character.
+ * @param codePoint The special Unicode code point to parse.
+ * @return The parsed character.
+ * @throws ParseException Thrown when the given Unicode code point can't be parsed. This happens
+ * when it's not a special code point.
+ */
+ public static char parseSpecialCodePoint(int codePoint) throws ParseException {
+ for (int i = 0; i < specialCodePoints.length; i++) {
+ if (specialCodePoints[i] == codePoint) {
+ return specialChars[i];
+ }
+ }
+ throw new ParseException("The given Unicode code point " + codePoint + " is not a special code point.", 0); //$NON-NLS-1$ //$NON-NLS-2$
+ }
+
+ /**
+ * This is an overloaded function. See the Javadoc of the other function overload for details.
+ * @param str The string which should be parsed.
+ * @return The parsed string.
+ */
+ public static String parseString(String str) {
+ return parseString(str, EnumSet.allOf(ParseFlags.class));
+ }
+
+ /**
+ * Parses any escaped characters and replaces them with their corresponding Unicode code points.
+ * This function parses all escape notations which are supported by gcc and / or gdb. Those are:</br></br>
+ *
+ * <ul>
+ * <li>Special char escape notations: \a, \b, \e, \E, \f, \n, \r, \t, \v, \', \", \\, and \?</li>
+ *
+ * <li>Octal escape notation: An initial backslash, followed by 1, 2, or 3 octal digits. Values
+ * above 0xFF are ignored. Octal escape notations may not use more than 3 octal digits.</li>
+ *
+ * <li>Hexadecimal escape notation: An initial backslash, followed by an "x" and 1 or more
+ * hexadecimal digits. Hexadecimal escape notations may not use more than 4 hexadecimal digits
+ * (although gcc accepts hexadecimal escape notations of any arbitrary length).</li>
+ *
+ * <li>Short Unicode escape notation: An initial backslash, followed by an "u" and exactly 4
+ * hexadecimal digits.</li>
+ *
+ * <li>Long Unicode escape notation: An initial backslash, followed by an "U" and exactly 8
+ * hexadecimal digits.</li>
+ * </ul>
+ * @param str The string which should be parsed.
+ * @param parseFlags The set of escape notations which should be parsed.
+ * @return The parsed string.
+ */
+ public static String parseString(String str, EnumSet<ParseFlags> parseFlags) {
+ StringBuffer buffer = new StringBuffer();
+ StringBuffer escapeBuffer = new StringBuffer();
+ EscapeStatus escStatus = EscapeStatus.NONE;
+
+ for (int i = 0; i < str.length(); i++) {
+ char c = str.charAt(i);
+ boolean consumeChar = true;
+ boolean isLastChar = i == str.length() - 1;
+
+ if (escStatus == EscapeStatus.NONE) {
+ if (c == '\\') {
+ // Escaping begins. Reset the escape buffer.
+ escapeBuffer.setLength(0);
+ escapeBuffer.append(c);
+ escStatus = EscapeStatus.BEGIN;
+ }
+ } else if (escStatus == EscapeStatus.BEGIN) {
+ if (parseFlags.contains(ParseFlags.SPECIAL_CHARS) && isSpecialChar(c)) {
+ try {
+ buffer.appendCodePoint(parseSpecialChar(c));
+ escStatus = EscapeStatus.VALID;
+ } catch (ParseException e) {
+ // This is just for completeness. We will actually never catch any ParseException here
+ // since we already checked the character with isSpecialChar() before.
+ escapeBuffer.append(c);
+ escStatus = EscapeStatus.INVALID;
+ }
+ } else if (parseFlags.contains(ParseFlags.OCTAL_NUMBERS) && c >= '0' && c <= '7') {
+ escStatus = EscapeStatus.OCTAL_NUMBER;
+ // Don't consume this character right now - as this wouldn't work if it's the last character.
+ consumeChar = false;
+ } else if (parseFlags.contains(ParseFlags.HEX_NUMBERS) && c == 'x') {
+ escStatus = EscapeStatus.HEX_NUMBER;
+ } else if (parseFlags.contains(ParseFlags.UNICODE_SHORT_NUMBERS) && c == 'u') {
+ escStatus = EscapeStatus.UNICODE_SHORT_NUMBER;
+ } else if (parseFlags.contains(ParseFlags.UNICODE_LONG_NUMBERS) && c == 'U') {
+ escStatus = EscapeStatus.UNICODE_LONG_NUMBER;
+ } else {
+ escStatus = EscapeStatus.INVALID;
+ }
+ if (consumeChar) {
+ escapeBuffer.append(c);
+ }
+ } else if (escStatus == EscapeStatus.HEX_NUMBER) {
+ // Only consume this character if it belongs to the escape sequence.
+ consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+ if (consumeChar) {
+ escapeBuffer.append(c);
+ }
+
+ if (!consumeChar || isLastChar || escapeBuffer.length() == 6) {
+ // The escape sequence is terminated. Set the escape status to invalid until
+ // we know that it's actually valid.
+ escStatus = EscapeStatus.INVALID;
+ if (escapeBuffer.length() > 2) {
+ // Decode the hexadecimal number.
+ try {
+ int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16);
+ if (codePoint <= 0x10FFFF) {
+ buffer.appendCodePoint(codePoint);
+ escStatus = EscapeStatus.VALID;
+ }
+ } catch (NumberFormatException e) {
+ }
+ }
+ }
+ } else if (escStatus == EscapeStatus.UNICODE_SHORT_NUMBER || escStatus == EscapeStatus.UNICODE_LONG_NUMBER) {
+ // Only consume this character if it belongs to the escape sequence.
+ consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
+ if (consumeChar) {
+ escapeBuffer.append(c);
+ }
+
+ int finalLength = escStatus == EscapeStatus.UNICODE_SHORT_NUMBER ? 6 : 10;
+ if (escapeBuffer.length() == finalLength) {
+ // The escape sequence is terminated. Set the escape status to invalid until
+ // we know that it's actually valid. Decode the hexadecimal number.
+ escStatus = EscapeStatus.INVALID;
+ try {
+ int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16);
+ if (codePoint <= 0x10FFFF) {
+ buffer.appendCodePoint(codePoint);
+ escStatus = EscapeStatus.VALID;
+ }
+ } catch (NumberFormatException e) {
+ }
+ } else if (!consumeChar || isLastChar) {
+ // The escape sequence is terminated and invalid.
+ escStatus = EscapeStatus.INVALID;
+ }
+ } else if (escStatus == EscapeStatus.OCTAL_NUMBER) {
+ // Only consume this character if it belongs to the escape sequence.
+ consumeChar = c >= '0' && c <= '7';
+ if (consumeChar) {
+ escapeBuffer.append(c);
+ }
+
+ if (!consumeChar || isLastChar || escapeBuffer.length() == 4) {
+ // The escape sequence is terminated. Set the escape status to invalid until
+ // we know that it's actually valid.
+ escStatus = EscapeStatus.INVALID;
+ if (escapeBuffer.length() > 1) {
+ // Decode the octal number.
+ try {
+ int codePoint = Integer.parseInt(escapeBuffer.toString().substring(1), 8);
+ if (codePoint <= 0xFF) {
+ buffer.appendCodePoint(codePoint);
+ escStatus = EscapeStatus.VALID;
+ }
+ } catch (NumberFormatException e) {
+ }
+ }
+ }
+ }
+
+ if (escStatus == EscapeStatus.NONE) {
+ // Current character isn't escaped - copy it over to the destination buffer.
+ buffer.append(c);
+ } else if (escStatus == EscapeStatus.VALID) {
+ escStatus = EscapeStatus.NONE;
+ } else if (escStatus == EscapeStatus.INVALID) {
+ buffer.append(escapeBuffer);
+ escStatus = EscapeStatus.NONE;
+ }
+
+ if (!consumeChar) {
+ // Don't consume the current character.
+ i--;
+ }
+ }
+
+ // Check for non-finished escape sequences at the end of the string.
+ if (escStatus != EscapeStatus.NONE) {
+ buffer.append(escapeBuffer);
+ }
+
+ // Convert the buffer into a string and return it.
+ return buffer.toString();
+ }
+
+ /**
+ * Transcodes the given string. This is done as follows:</br></br>
+ * 1) The given string is encoded into a binary byte buffer.</br></br>
+ * 2) It's tested whether this binary byte buffer seems to represent a string which is encoded as
+ * either ASCII, Latin-1, or UTF-8. If this is the case, the binary byte buffer is decoded back into
+ * a string and this string is returned. If the test is negative, the given string is returned without
+ * modification because its encoding can't be reliably determined in this case.
+ * The most important use case of this function is to transcode a string which is actually UTF-8 but has
+ * been incorrectly decoded as Latin-1 instead.
+ * @param str The string to transcode.
+ * @return The transcoded string.
+ */
+ public static String transcodeString(String str) {
+ // Try to transcode the string from Latin-1 to UTF-8 (ASCII doesn't need to be explicitly
+ // considered here since Latin-1 is backwards compatible with ASCII). The transcoding will
+ // almost certainly only succeed if the string actually *is* encoded in UTF-8. If the
+ // transcoding fails, the string is simply left unchanged.
+ try {
+ // First, try to encode the string as Latin-1 in order to obtain the binary byte
+ // representation of the string.
+ CharsetEncoder latin1Encoder = Charset.forName("ISO-8859-1").newEncoder(); //$NON-NLS-1$
+ ByteBuffer stringBytes = latin1Encoder.encode(CharBuffer.wrap(str.toCharArray()));
+
+ // Next, try to decode the string as UTF-8. This will almost certainly only succeed
+ // if the string actually *is* encoded in UTF-8. Note that if the decoding fails,
+ // an exception is thrown before the str variable is assigned. The original string
+ // is therefore left unchanged in this case.
+ CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$
+ str = utf8Decoder.decode(stringBytes).toString();
+ } catch (Exception e) {
+ }
+
+ return str;
+ }
+
+ /**
+ * Escapes any non-printable characters as well as the printable special characters single quotation
+ * mark, double quotation mark, backslash, and literal question mark within the given string. Supports
+ * the entire Unicode code space.
+ * @param str The string which should be escaped.
+ * @return The escaped string.
+ */
+ public static String escapeString(String str) {
+ return escapeString(str, true);
+ }
+
+ /**
+ * Escapes any non-printable characters within the given string. Supports the entire Unicode code space.
+ * @param str The string which should be escaped.
+ * @param escapePrintableSpecialChars Defines whether the printable special characters single
+ * quotation mark, double quotation mark, backslash, and literal question mark should be
+ * escaped as well, or not.
+ * @return The escaped string.
+ */
+ public static String escapeString(String str, boolean escapePrintableSpecialChars) {
+ StringBuffer buffer = new StringBuffer();
+
+ for (int i = 0; i < str.length(); i++) {
+ // Get the current character code point. Note that using the Java "char" data type isn't
+ // sufficient here, as it can't handle all Unicode characters.
+ int codePoint = str.codePointAt(i);
+ if (Character.isSupplementaryCodePoint(codePoint)) {
+ i++;
+ }
+
+ // Check the code point type of the character in order to determine whether it's
+ // printable or not.
+ int codePointType = Character.getType(codePoint);
+ switch (codePointType) {
+ case Character.LINE_SEPARATOR:
+ case Character.PARAGRAPH_SEPARATOR:
+ case Character.CONTROL:
+ case Character.PRIVATE_USE:
+ case Character.SURROGATE:
+ case Character.UNASSIGNED:
+ // Non-printable character.
+ if (isSpecialCodePoint(codePoint)) {
+ // Escape by using the special character escape notation.
+ buffer.append('\\');
+ try {
+ buffer.append(parseSpecialCodePoint(codePoint));
+ } catch (ParseException e) {
+ buffer.appendCodePoint(codePoint);
+ }
+ } else if (codePoint == 0x00) {
+ // Escape the null character separately - don't use leading zeros.
+ buffer.append("\\0"); //$NON-NLS-1$
+ } else if (codePoint <= 0xFF) {
+ // Escape by using the octal escape notation.
+ buffer.append(String.format("\\%03o", codePoint)); //$NON-NLS-1$
+ } else if (codePoint <= 0xFFFF) {
+ // Escape by using the short Unicode escape notation.
+ buffer.append(String.format("\\u%04x", codePoint)); //$NON-NLS-1$
+ } else {
+ // Escape by using the long Unicode escape notation.
+ buffer.append(String.format("\\U%08x", codePoint)); //$NON-NLS-1$
+ }
+ break;
+ default:
+ // Printable character.
+ if (escapePrintableSpecialChars && isSpecialCodePoint(codePoint)) {
+ // Escape by using the special character escape notation.
+ buffer.append('\\');
+ try {
+ buffer.append(parseSpecialCodePoint(codePoint));
+ } catch (ParseException e) {
+ buffer.appendCodePoint(codePoint);
+ }
+ } else {
+ // Don't escape.
+ buffer.appendCodePoint(codePoint);
+ }
+ }
+ }
+
+ return buffer.toString();
+ }
+}

Back to the top