diff options
Diffstat (limited to 'dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java')
-rw-r--r-- | dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java | 850 |
1 files changed, 420 insertions, 430 deletions
diff --git a/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java b/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java index 7812a5202d9..af9f9aef5a6 100644 --- a/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java +++ b/dsf-gdb/org.eclipse.cdt.dsf.gdb/src/org/eclipse/cdt/dsf/mi/service/command/output/MIStringHandler.java @@ -7,7 +7,7 @@ * https://www.eclipse.org/legal/epl-2.0/ * * SPDX-License-Identifier: EPL-2.0 - * + * * Contributors: * Mathias Kunter - Initial Implementation (Bug 307311) *******************************************************************************/ @@ -29,434 +29,424 @@ import java.util.Map.Entry; * @since 4.1 */ public class MIStringHandler { - - /** - * A map of special characters which are used within escape notations to represent a - * corresponding Unicode code point (i.e. character code). - */ + + /** + * A map of special characters which are used within escape notations to represent a + * corresponding Unicode code point (i.e. character code). + */ // Use a LinkedHashMap to preserve order, so as to get 'e' and not 'E' - private static Map<Character,Integer> fSpecialCharactersToCodePointMap = new LinkedHashMap<Character,Integer>(); - static { - fSpecialCharactersToCodePointMap.put('a', 0x07); // Alert (bell) character - fSpecialCharactersToCodePointMap.put('b', 0x08); // Backspace character - fSpecialCharactersToCodePointMap.put('e', 0x1B); // GNU extension: Escape character - fSpecialCharactersToCodePointMap.put('E', 0x1B); // same as 'e' - fSpecialCharactersToCodePointMap.put('f', 0x0C); // Form feed character - fSpecialCharactersToCodePointMap.put('n', 0x0A); // New line character - fSpecialCharactersToCodePointMap.put('r', 0x0D); // Carriage return character - fSpecialCharactersToCodePointMap.put('t', 0x09); // Horizontal tabulation character - fSpecialCharactersToCodePointMap.put('v', 0x0B); // Vertical tabulation character - fSpecialCharactersToCodePointMap.put('\'', 0x27); // Single quotation mark - fSpecialCharactersToCodePointMap.put('"', 0x22); // Double quotation mark - fSpecialCharactersToCodePointMap.put('\\', 0x5C); // Backslash - fSpecialCharactersToCodePointMap.put('?', 0x3F); // Literal question mark - } - - /** - * An internal helper enumeration which holds the current status while parsing an escaped - * text sequence. - */ - private enum EscapeStatus { - NONE, - BEGIN, - OCTAL_NUMBER, - HEX_NUMBER, - UNICODE_SHORT_NUMBER, - UNICODE_LONG_NUMBER, - VALID, - INVALID - } - - /** - * An enumeration defining the escape sequences which should be parsed. - */ - public enum ParseFlags { - SPECIAL_CHARS, - OCTAL_NUMBERS, - HEX_NUMBERS, - UNICODE_SHORT_NUMBERS, - UNICODE_LONG_NUMBERS - } - - /** - * Translates the given C string into a string suitable for display. This includes handling - * of escaped characters and different string encodings. This is necessary in order to correctly - * deal with non-ASCII strings. - * @param str The C string to translate. - * @param escapeChars Defines whether non-printable characters should be escaped within - * the translated string, or not. - * @return The translated string. - */ - public static String translateCString(String str, boolean escapeChars) { - if (escapeChars) { - // Don't parse the special character escape notations here. We can do this here because - // we want to keep them in their escaped form anyway, and because the following string - // transcoding process isn't affected by escaped special chars. By doing so we avoid - // caring about some nasty details of the special character escaping process: for - // example, single quotation marks are commonly only escaped within character constants, - // while double quotation marks are commonly only escaped within string constants. By - // not parsing the special character escape notations at all here, we just keep the - // original special character escaping provided by the given MI string. - str = parseString(str, EnumSet.complementOf(EnumSet.of(ParseFlags.SPECIAL_CHARS))); - } else { - // Parse all escaped characters. - str = parseString(str); - } - - // Transcode the string in order to handle non-ASCII strings correctly. - str = transcodeString(str); - - if (escapeChars) { - // Escape any non-printable characters again, as we want to be able to display them. - // However, don't escape any printable special chars, as they haven't been parsed before. - str = escapeString(str, false); - } else { - // No escaping necessary here. We however have to make sure that we use the correct line - // separation character sequence. - str = str.replace("\n", System.getProperty("line.separator", "\n")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ - } - - return str; - } - - /** - * Returns whether the given character is a special character, or not. - * @param c The character to test. - * @return The test result. - */ - public static boolean isSpecialChar(char c) { - return fSpecialCharactersToCodePointMap.containsKey(c); - } - - /** - * Returns whether the given Unicode code point is a special code point, or not. - * @param codePoint The Unicode code point to test. - * @return The test result. - */ - public static boolean isSpecialCodePoint(int codePoint) { - return fSpecialCharactersToCodePointMap.containsValue(codePoint); - } - - /** - * Parses the given special character into an Unicode code point. - * @param c The special character to parse. - * @return The parsed Unicode code point. - * @throws ParseException Thrown when the given character can't be parsed. This happens when it's - * not a special character. - */ - public static int parseSpecialChar(char c) throws ParseException { - Integer codePoint = fSpecialCharactersToCodePointMap.get(c); - if (codePoint != null) { - return codePoint; - } - throw new ParseException("The given character '" + c + "' is not a special character.", 0); //$NON-NLS-1$ //$NON-NLS-2$ - } - - /** - * Parses the given special Unicode code point into a character. - * @param codePoint The special Unicode code point to parse. - * @return The parsed character. - * @throws ParseException Thrown when the given Unicode code point can't be parsed. This happens - * when it's not a special code point. - */ - public static char parseSpecialCodePoint(int codePoint) throws ParseException { - for (Entry<Character, Integer> entry : fSpecialCharactersToCodePointMap.entrySet()) { - if (entry.getValue().equals(codePoint)) { - return entry.getKey(); - } - } - throw new ParseException("The given Unicode code point " + codePoint + " is not a special code point.", 0); //$NON-NLS-1$ //$NON-NLS-2$ - } - - /** - * This is an overloaded function. See the Javadoc of the other function overload for details. - * @param str The string which should be parsed. - * @return The parsed string. - */ - public static String parseString(String str) { - return parseString(str, EnumSet.allOf(ParseFlags.class)); - } - - /** - * Parses any escaped characters and replaces them with their corresponding Unicode code points. - * This function parses all escape notations which are supported by gcc and / or gdb. Those are:</br></br> - * - * <ul> - * <li>Special char escape notations: \a, \b, \e, \E, \f, \n, \r, \t, \v, \', \", \\, and \?</li> - * - * <li>Octal escape notation: An initial backslash, followed by 1, 2, or 3 octal digits. Values - * above 0xFF are ignored. Octal escape notations may not use more than 3 octal digits.</li> - * - * <li>Hexadecimal escape notation: An initial backslash, followed by an "x" and 1 or more - * hexadecimal digits. Hexadecimal escape notations may not use more than 4 hexadecimal digits - * (although gcc accepts hexadecimal escape notations of any arbitrary length).</li> - * - * <li>Short Unicode escape notation: An initial backslash, followed by an "u" and exactly 4 - * hexadecimal digits.</li> - * - * <li>Long Unicode escape notation: An initial backslash, followed by an "U" and exactly 8 - * hexadecimal digits.</li> - * </ul> - * @param str The string which should be parsed. - * @param parseFlags The set of escape notations which should be parsed. - * @return The parsed string. - */ - public static String parseString(String str, EnumSet<ParseFlags> parseFlags) { - StringBuilder buffer = new StringBuilder(); - StringBuilder escapeBuffer = new StringBuilder(); - EscapeStatus escStatus = EscapeStatus.NONE; - - for (int i = 0; i < str.length(); i++) { - char c = str.charAt(i); - boolean consumeChar = true; - boolean isLastChar = i == str.length() - 1; - - if (escStatus == EscapeStatus.NONE) { - if (c == '\\') { - // Escaping begins. Reset the escape buffer. - escapeBuffer.setLength(0); - escapeBuffer.append(c); - escStatus = EscapeStatus.BEGIN; - } - } else if (escStatus == EscapeStatus.BEGIN) { - if (parseFlags.contains(ParseFlags.SPECIAL_CHARS) && isSpecialChar(c)) { - try { - buffer.appendCodePoint(parseSpecialChar(c)); - escStatus = EscapeStatus.VALID; - } catch (ParseException e) { - // This is just for completeness. We will actually never catch any ParseException here - // since we already checked the character with isSpecialChar() before. - escapeBuffer.append(c); - escStatus = EscapeStatus.INVALID; - } - } else if (parseFlags.contains(ParseFlags.OCTAL_NUMBERS) && c >= '0' && c <= '7') { - escStatus = EscapeStatus.OCTAL_NUMBER; - // Don't consume this character right now - as this wouldn't work if it's the last character. - consumeChar = false; - } else if (parseFlags.contains(ParseFlags.HEX_NUMBERS) && c == 'x') { - escStatus = EscapeStatus.HEX_NUMBER; - } else if (parseFlags.contains(ParseFlags.UNICODE_SHORT_NUMBERS) && c == 'u') { - escStatus = EscapeStatus.UNICODE_SHORT_NUMBER; - } else if (parseFlags.contains(ParseFlags.UNICODE_LONG_NUMBERS) && c == 'U') { - escStatus = EscapeStatus.UNICODE_LONG_NUMBER; - } else { - escStatus = EscapeStatus.INVALID; - } - if (consumeChar) { - escapeBuffer.append(c); - } - } else if (escStatus == EscapeStatus.HEX_NUMBER) { - // Only consume this character if it belongs to the escape sequence. - consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); - if (consumeChar) { - escapeBuffer.append(c); - } - - if (!consumeChar || isLastChar || escapeBuffer.length() == 6) { - // The escape sequence is terminated. Set the escape status to invalid until - // we know that it's actually valid. - escStatus = EscapeStatus.INVALID; - if (escapeBuffer.length() > 2) { - // Decode the hexadecimal number. - try { - int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16); - if (codePoint <= 0x10FFFF) { - buffer.appendCodePoint(codePoint); - escStatus = EscapeStatus.VALID; - } - } catch (NumberFormatException e) { - } - } - } - } else if (escStatus == EscapeStatus.UNICODE_SHORT_NUMBER || escStatus == EscapeStatus.UNICODE_LONG_NUMBER) { - // Only consume this character if it belongs to the escape sequence. - consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); - if (consumeChar) { - escapeBuffer.append(c); - } - - int finalLength = escStatus == EscapeStatus.UNICODE_SHORT_NUMBER ? 6 : 10; - if (escapeBuffer.length() == finalLength) { - // The escape sequence is terminated. Set the escape status to invalid until - // we know that it's actually valid. Decode the hexadecimal number. - escStatus = EscapeStatus.INVALID; - try { - int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16); - if (codePoint <= 0x10FFFF) { - buffer.appendCodePoint(codePoint); - escStatus = EscapeStatus.VALID; - } - } catch (NumberFormatException e) { - } - } else if (!consumeChar || isLastChar) { - // The escape sequence is terminated and invalid. - escStatus = EscapeStatus.INVALID; - } - } else if (escStatus == EscapeStatus.OCTAL_NUMBER) { - // Only consume this character if it belongs to the escape sequence. - consumeChar = c >= '0' && c <= '7'; - if (consumeChar) { - escapeBuffer.append(c); - } - - if (!consumeChar || isLastChar || escapeBuffer.length() == 4) { - // The escape sequence is terminated. Set the escape status to invalid until - // we know that it's actually valid. - escStatus = EscapeStatus.INVALID; - if (escapeBuffer.length() > 1) { - // Decode the octal number. - try { - int codePoint = Integer.parseInt(escapeBuffer.toString().substring(1), 8); - if (codePoint <= 0xFF) { - buffer.appendCodePoint(codePoint); - escStatus = EscapeStatus.VALID; - } - } catch (NumberFormatException e) { - } - } - } - } - - if (escStatus == EscapeStatus.NONE) { - // Current character isn't escaped - copy it over to the destination buffer. - buffer.append(c); - } else if (escStatus == EscapeStatus.VALID) { - escStatus = EscapeStatus.NONE; - } else if (escStatus == EscapeStatus.INVALID) { - buffer.append(escapeBuffer); - escStatus = EscapeStatus.NONE; - } - - if (!consumeChar) { - // Don't consume the current character. - i--; - } - } - - // Check for non-finished escape sequences at the end of the string. - if (escStatus != EscapeStatus.NONE) { - buffer.append(escapeBuffer); - } - - // Convert the buffer into a string and return it. - return buffer.toString(); - } - - /** - * Transcodes the given string. This is done as follows:</br></br> - * 1) The given string is encoded into a binary byte buffer.</br></br> - * 2) It's tested whether this binary byte buffer seems to represent a string which is encoded as - * either ASCII, Latin-1, or UTF-8. If this is the case, the binary byte buffer is decoded back into - * a string and this string is returned. If the test is negative, the given string is returned without - * modification because its encoding can't be reliably determined in this case. - * The most important use case of this function is to transcode a string which is actually UTF-8 but has - * been incorrectly decoded as Latin-1 instead. - * @param str The string to transcode. - * @return The transcoded string. - */ - public static String transcodeString(String str) { - // Try to transcode the string from Latin-1 to UTF-8 (ASCII doesn't need to be explicitly - // considered here since Latin-1 is backwards compatible with ASCII). The transcoding will - // almost certainly only succeed if the string actually *is* encoded in UTF-8. If the - // transcoding fails, the string is simply left unchanged. - try { - // First, try to encode the string as Latin-1 in order to obtain the binary byte - // representation of the string. - CharsetEncoder latin1Encoder = Charset.forName("ISO-8859-1").newEncoder(); //$NON-NLS-1$ - ByteBuffer stringBytes = latin1Encoder.encode(CharBuffer.wrap(str.toCharArray())); - - // Next, try to decode the string as UTF-8. This will almost certainly only succeed - // if the string actually *is* encoded in UTF-8. Note that if the decoding fails, - // an exception is thrown before the str variable is assigned. The original string - // is therefore left unchanged in this case. - CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$ - str = utf8Decoder.decode(stringBytes).toString(); - } catch (Exception e) { - } - - return str; - } - - /** - * Escapes any non-printable characters as well as the printable special characters single quotation - * mark, double quotation mark, backslash, and literal question mark within the given string. Supports - * the entire Unicode code space. - * @param str The string which should be escaped. - * @return The escaped string. - */ - public static String escapeString(String str) { - return escapeString(str, true); - } - - /** - * Escapes any non-printable characters within the given string. Supports the entire Unicode code space. - * @param str The string which should be escaped. - * @param escapePrintableSpecialChars Defines whether the printable special characters single - * quotation mark, double quotation mark, backslash, and literal question mark should be - * escaped as well, or not. - * @return The escaped string. - */ - public static String escapeString(String str, boolean escapePrintableSpecialChars) { - StringBuilder buffer = new StringBuilder(); - - for (int i = 0; i < str.length(); i++) { - // Get the current character code point. Note that using the Java "char" data type isn't - // sufficient here, as it can't handle all Unicode characters. - int codePoint = str.codePointAt(i); - if (Character.isSupplementaryCodePoint(codePoint)) { - i++; - } - - // Check the code point type of the character in order to determine whether it's - // printable or not. - int codePointType = Character.getType(codePoint); - switch (codePointType) { - case Character.LINE_SEPARATOR: - case Character.PARAGRAPH_SEPARATOR: - case Character.CONTROL: - case Character.PRIVATE_USE: - case Character.SURROGATE: - case Character.UNASSIGNED: - // Non-printable character. - if (isSpecialCodePoint(codePoint)) { - // Escape by using the special character escape notation. - buffer.append('\\'); - try { - buffer.append(parseSpecialCodePoint(codePoint)); - } catch (ParseException e) { - buffer.appendCodePoint(codePoint); - } - } else if (codePoint == 0x00) { - // Escape the null character separately - don't use leading zeros. - buffer.append("\\0"); //$NON-NLS-1$ - } else if (codePoint <= 0xFF) { - // Escape by using the octal escape notation. - buffer.append(String.format("\\%03o", codePoint)); //$NON-NLS-1$ - } else if (codePoint <= 0xFFFF) { - // Escape by using the short Unicode escape notation. - buffer.append(String.format("\\u%04x", codePoint)); //$NON-NLS-1$ - } else { - // Escape by using the long Unicode escape notation. - buffer.append(String.format("\\U%08x", codePoint)); //$NON-NLS-1$ - } - break; - default: - // Printable character. - if (escapePrintableSpecialChars && isSpecialCodePoint(codePoint)) { - // Escape by using the special character escape notation. - buffer.append('\\'); - try { - buffer.append(parseSpecialCodePoint(codePoint)); - } catch (ParseException e) { - buffer.appendCodePoint(codePoint); - } - } else { - // Don't escape. - buffer.appendCodePoint(codePoint); - } - } - } - - return buffer.toString(); - } + private static Map<Character, Integer> fSpecialCharactersToCodePointMap = new LinkedHashMap<Character, Integer>(); + static { + fSpecialCharactersToCodePointMap.put('a', 0x07); // Alert (bell) character + fSpecialCharactersToCodePointMap.put('b', 0x08); // Backspace character + fSpecialCharactersToCodePointMap.put('e', 0x1B); // GNU extension: Escape character + fSpecialCharactersToCodePointMap.put('E', 0x1B); // same as 'e' + fSpecialCharactersToCodePointMap.put('f', 0x0C); // Form feed character + fSpecialCharactersToCodePointMap.put('n', 0x0A); // New line character + fSpecialCharactersToCodePointMap.put('r', 0x0D); // Carriage return character + fSpecialCharactersToCodePointMap.put('t', 0x09); // Horizontal tabulation character + fSpecialCharactersToCodePointMap.put('v', 0x0B); // Vertical tabulation character + fSpecialCharactersToCodePointMap.put('\'', 0x27); // Single quotation mark + fSpecialCharactersToCodePointMap.put('"', 0x22); // Double quotation mark + fSpecialCharactersToCodePointMap.put('\\', 0x5C); // Backslash + fSpecialCharactersToCodePointMap.put('?', 0x3F); // Literal question mark + } + + /** + * An internal helper enumeration which holds the current status while parsing an escaped + * text sequence. + */ + private enum EscapeStatus { + NONE, BEGIN, OCTAL_NUMBER, HEX_NUMBER, UNICODE_SHORT_NUMBER, UNICODE_LONG_NUMBER, VALID, INVALID + } + + /** + * An enumeration defining the escape sequences which should be parsed. + */ + public enum ParseFlags { + SPECIAL_CHARS, OCTAL_NUMBERS, HEX_NUMBERS, UNICODE_SHORT_NUMBERS, UNICODE_LONG_NUMBERS + } + + /** + * Translates the given C string into a string suitable for display. This includes handling + * of escaped characters and different string encodings. This is necessary in order to correctly + * deal with non-ASCII strings. + * @param str The C string to translate. + * @param escapeChars Defines whether non-printable characters should be escaped within + * the translated string, or not. + * @return The translated string. + */ + public static String translateCString(String str, boolean escapeChars) { + if (escapeChars) { + // Don't parse the special character escape notations here. We can do this here because + // we want to keep them in their escaped form anyway, and because the following string + // transcoding process isn't affected by escaped special chars. By doing so we avoid + // caring about some nasty details of the special character escaping process: for + // example, single quotation marks are commonly only escaped within character constants, + // while double quotation marks are commonly only escaped within string constants. By + // not parsing the special character escape notations at all here, we just keep the + // original special character escaping provided by the given MI string. + str = parseString(str, EnumSet.complementOf(EnumSet.of(ParseFlags.SPECIAL_CHARS))); + } else { + // Parse all escaped characters. + str = parseString(str); + } + + // Transcode the string in order to handle non-ASCII strings correctly. + str = transcodeString(str); + + if (escapeChars) { + // Escape any non-printable characters again, as we want to be able to display them. + // However, don't escape any printable special chars, as they haven't been parsed before. + str = escapeString(str, false); + } else { + // No escaping necessary here. We however have to make sure that we use the correct line + // separation character sequence. + str = str.replace("\n", System.getProperty("line.separator", "\n")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ + } + + return str; + } + + /** + * Returns whether the given character is a special character, or not. + * @param c The character to test. + * @return The test result. + */ + public static boolean isSpecialChar(char c) { + return fSpecialCharactersToCodePointMap.containsKey(c); + } + + /** + * Returns whether the given Unicode code point is a special code point, or not. + * @param codePoint The Unicode code point to test. + * @return The test result. + */ + public static boolean isSpecialCodePoint(int codePoint) { + return fSpecialCharactersToCodePointMap.containsValue(codePoint); + } + + /** + * Parses the given special character into an Unicode code point. + * @param c The special character to parse. + * @return The parsed Unicode code point. + * @throws ParseException Thrown when the given character can't be parsed. This happens when it's + * not a special character. + */ + public static int parseSpecialChar(char c) throws ParseException { + Integer codePoint = fSpecialCharactersToCodePointMap.get(c); + if (codePoint != null) { + return codePoint; + } + throw new ParseException("The given character '" + c + "' is not a special character.", 0); //$NON-NLS-1$ //$NON-NLS-2$ + } + + /** + * Parses the given special Unicode code point into a character. + * @param codePoint The special Unicode code point to parse. + * @return The parsed character. + * @throws ParseException Thrown when the given Unicode code point can't be parsed. This happens + * when it's not a special code point. + */ + public static char parseSpecialCodePoint(int codePoint) throws ParseException { + for (Entry<Character, Integer> entry : fSpecialCharactersToCodePointMap.entrySet()) { + if (entry.getValue().equals(codePoint)) { + return entry.getKey(); + } + } + throw new ParseException("The given Unicode code point " + codePoint + " is not a special code point.", 0); //$NON-NLS-1$ //$NON-NLS-2$ + } + + /** + * This is an overloaded function. See the Javadoc of the other function overload for details. + * @param str The string which should be parsed. + * @return The parsed string. + */ + public static String parseString(String str) { + return parseString(str, EnumSet.allOf(ParseFlags.class)); + } + + /** + * Parses any escaped characters and replaces them with their corresponding Unicode code points. + * This function parses all escape notations which are supported by gcc and / or gdb. Those are:</br></br> + * + * <ul> + * <li>Special char escape notations: \a, \b, \e, \E, \f, \n, \r, \t, \v, \', \", \\, and \?</li> + * + * <li>Octal escape notation: An initial backslash, followed by 1, 2, or 3 octal digits. Values + * above 0xFF are ignored. Octal escape notations may not use more than 3 octal digits.</li> + * + * <li>Hexadecimal escape notation: An initial backslash, followed by an "x" and 1 or more + * hexadecimal digits. Hexadecimal escape notations may not use more than 4 hexadecimal digits + * (although gcc accepts hexadecimal escape notations of any arbitrary length).</li> + * + * <li>Short Unicode escape notation: An initial backslash, followed by an "u" and exactly 4 + * hexadecimal digits.</li> + * + * <li>Long Unicode escape notation: An initial backslash, followed by an "U" and exactly 8 + * hexadecimal digits.</li> + * </ul> + * @param str The string which should be parsed. + * @param parseFlags The set of escape notations which should be parsed. + * @return The parsed string. + */ + public static String parseString(String str, EnumSet<ParseFlags> parseFlags) { + StringBuilder buffer = new StringBuilder(); + StringBuilder escapeBuffer = new StringBuilder(); + EscapeStatus escStatus = EscapeStatus.NONE; + + for (int i = 0; i < str.length(); i++) { + char c = str.charAt(i); + boolean consumeChar = true; + boolean isLastChar = i == str.length() - 1; + + if (escStatus == EscapeStatus.NONE) { + if (c == '\\') { + // Escaping begins. Reset the escape buffer. + escapeBuffer.setLength(0); + escapeBuffer.append(c); + escStatus = EscapeStatus.BEGIN; + } + } else if (escStatus == EscapeStatus.BEGIN) { + if (parseFlags.contains(ParseFlags.SPECIAL_CHARS) && isSpecialChar(c)) { + try { + buffer.appendCodePoint(parseSpecialChar(c)); + escStatus = EscapeStatus.VALID; + } catch (ParseException e) { + // This is just for completeness. We will actually never catch any ParseException here + // since we already checked the character with isSpecialChar() before. + escapeBuffer.append(c); + escStatus = EscapeStatus.INVALID; + } + } else if (parseFlags.contains(ParseFlags.OCTAL_NUMBERS) && c >= '0' && c <= '7') { + escStatus = EscapeStatus.OCTAL_NUMBER; + // Don't consume this character right now - as this wouldn't work if it's the last character. + consumeChar = false; + } else if (parseFlags.contains(ParseFlags.HEX_NUMBERS) && c == 'x') { + escStatus = EscapeStatus.HEX_NUMBER; + } else if (parseFlags.contains(ParseFlags.UNICODE_SHORT_NUMBERS) && c == 'u') { + escStatus = EscapeStatus.UNICODE_SHORT_NUMBER; + } else if (parseFlags.contains(ParseFlags.UNICODE_LONG_NUMBERS) && c == 'U') { + escStatus = EscapeStatus.UNICODE_LONG_NUMBER; + } else { + escStatus = EscapeStatus.INVALID; + } + if (consumeChar) { + escapeBuffer.append(c); + } + } else if (escStatus == EscapeStatus.HEX_NUMBER) { + // Only consume this character if it belongs to the escape sequence. + consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + if (consumeChar) { + escapeBuffer.append(c); + } + + if (!consumeChar || isLastChar || escapeBuffer.length() == 6) { + // The escape sequence is terminated. Set the escape status to invalid until + // we know that it's actually valid. + escStatus = EscapeStatus.INVALID; + if (escapeBuffer.length() > 2) { + // Decode the hexadecimal number. + try { + int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16); + if (codePoint <= 0x10FFFF) { + buffer.appendCodePoint(codePoint); + escStatus = EscapeStatus.VALID; + } + } catch (NumberFormatException e) { + } + } + } + } else if (escStatus == EscapeStatus.UNICODE_SHORT_NUMBER + || escStatus == EscapeStatus.UNICODE_LONG_NUMBER) { + // Only consume this character if it belongs to the escape sequence. + consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + if (consumeChar) { + escapeBuffer.append(c); + } + + int finalLength = escStatus == EscapeStatus.UNICODE_SHORT_NUMBER ? 6 : 10; + if (escapeBuffer.length() == finalLength) { + // The escape sequence is terminated. Set the escape status to invalid until + // we know that it's actually valid. Decode the hexadecimal number. + escStatus = EscapeStatus.INVALID; + try { + int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16); + if (codePoint <= 0x10FFFF) { + buffer.appendCodePoint(codePoint); + escStatus = EscapeStatus.VALID; + } + } catch (NumberFormatException e) { + } + } else if (!consumeChar || isLastChar) { + // The escape sequence is terminated and invalid. + escStatus = EscapeStatus.INVALID; + } + } else if (escStatus == EscapeStatus.OCTAL_NUMBER) { + // Only consume this character if it belongs to the escape sequence. + consumeChar = c >= '0' && c <= '7'; + if (consumeChar) { + escapeBuffer.append(c); + } + + if (!consumeChar || isLastChar || escapeBuffer.length() == 4) { + // The escape sequence is terminated. Set the escape status to invalid until + // we know that it's actually valid. + escStatus = EscapeStatus.INVALID; + if (escapeBuffer.length() > 1) { + // Decode the octal number. + try { + int codePoint = Integer.parseInt(escapeBuffer.toString().substring(1), 8); + if (codePoint <= 0xFF) { + buffer.appendCodePoint(codePoint); + escStatus = EscapeStatus.VALID; + } + } catch (NumberFormatException e) { + } + } + } + } + + if (escStatus == EscapeStatus.NONE) { + // Current character isn't escaped - copy it over to the destination buffer. + buffer.append(c); + } else if (escStatus == EscapeStatus.VALID) { + escStatus = EscapeStatus.NONE; + } else if (escStatus == EscapeStatus.INVALID) { + buffer.append(escapeBuffer); + escStatus = EscapeStatus.NONE; + } + + if (!consumeChar) { + // Don't consume the current character. + i--; + } + } + + // Check for non-finished escape sequences at the end of the string. + if (escStatus != EscapeStatus.NONE) { + buffer.append(escapeBuffer); + } + + // Convert the buffer into a string and return it. + return buffer.toString(); + } + + /** + * Transcodes the given string. This is done as follows:</br></br> + * 1) The given string is encoded into a binary byte buffer.</br></br> + * 2) It's tested whether this binary byte buffer seems to represent a string which is encoded as + * either ASCII, Latin-1, or UTF-8. If this is the case, the binary byte buffer is decoded back into + * a string and this string is returned. If the test is negative, the given string is returned without + * modification because its encoding can't be reliably determined in this case. + * The most important use case of this function is to transcode a string which is actually UTF-8 but has + * been incorrectly decoded as Latin-1 instead. + * @param str The string to transcode. + * @return The transcoded string. + */ + public static String transcodeString(String str) { + // Try to transcode the string from Latin-1 to UTF-8 (ASCII doesn't need to be explicitly + // considered here since Latin-1 is backwards compatible with ASCII). The transcoding will + // almost certainly only succeed if the string actually *is* encoded in UTF-8. If the + // transcoding fails, the string is simply left unchanged. + try { + // First, try to encode the string as Latin-1 in order to obtain the binary byte + // representation of the string. + CharsetEncoder latin1Encoder = Charset.forName("ISO-8859-1").newEncoder(); //$NON-NLS-1$ + ByteBuffer stringBytes = latin1Encoder.encode(CharBuffer.wrap(str.toCharArray())); + + // Next, try to decode the string as UTF-8. This will almost certainly only succeed + // if the string actually *is* encoded in UTF-8. Note that if the decoding fails, + // an exception is thrown before the str variable is assigned. The original string + // is therefore left unchanged in this case. + CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$ + str = utf8Decoder.decode(stringBytes).toString(); + } catch (Exception e) { + } + + return str; + } + + /** + * Escapes any non-printable characters as well as the printable special characters single quotation + * mark, double quotation mark, backslash, and literal question mark within the given string. Supports + * the entire Unicode code space. + * @param str The string which should be escaped. + * @return The escaped string. + */ + public static String escapeString(String str) { + return escapeString(str, true); + } + + /** + * Escapes any non-printable characters within the given string. Supports the entire Unicode code space. + * @param str The string which should be escaped. + * @param escapePrintableSpecialChars Defines whether the printable special characters single + * quotation mark, double quotation mark, backslash, and literal question mark should be + * escaped as well, or not. + * @return The escaped string. + */ + public static String escapeString(String str, boolean escapePrintableSpecialChars) { + StringBuilder buffer = new StringBuilder(); + + for (int i = 0; i < str.length(); i++) { + // Get the current character code point. Note that using the Java "char" data type isn't + // sufficient here, as it can't handle all Unicode characters. + int codePoint = str.codePointAt(i); + if (Character.isSupplementaryCodePoint(codePoint)) { + i++; + } + + // Check the code point type of the character in order to determine whether it's + // printable or not. + int codePointType = Character.getType(codePoint); + switch (codePointType) { + case Character.LINE_SEPARATOR: + case Character.PARAGRAPH_SEPARATOR: + case Character.CONTROL: + case Character.PRIVATE_USE: + case Character.SURROGATE: + case Character.UNASSIGNED: + // Non-printable character. + if (isSpecialCodePoint(codePoint)) { + // Escape by using the special character escape notation. + buffer.append('\\'); + try { + buffer.append(parseSpecialCodePoint(codePoint)); + } catch (ParseException e) { + buffer.appendCodePoint(codePoint); + } + } else if (codePoint == 0x00) { + // Escape the null character separately - don't use leading zeros. + buffer.append("\\0"); //$NON-NLS-1$ + } else if (codePoint <= 0xFF) { + // Escape by using the octal escape notation. + buffer.append(String.format("\\%03o", codePoint)); //$NON-NLS-1$ + } else if (codePoint <= 0xFFFF) { + // Escape by using the short Unicode escape notation. + buffer.append(String.format("\\u%04x", codePoint)); //$NON-NLS-1$ + } else { + // Escape by using the long Unicode escape notation. + buffer.append(String.format("\\U%08x", codePoint)); //$NON-NLS-1$ + } + break; + default: + // Printable character. + if (escapePrintableSpecialChars && isSpecialCodePoint(codePoint)) { + // Escape by using the special character escape notation. + buffer.append('\\'); + try { + buffer.append(parseSpecialCodePoint(codePoint)); + } catch (ParseException e) { + buffer.appendCodePoint(codePoint); + } + } else { + // Don't escape. + buffer.appendCodePoint(codePoint); + } + } + } + + return buffer.toString(); + } } |