Skip to main content
aboutsummaryrefslogtreecommitdiffstats
blob: d9f0f76a2b66e9f932af2eed9a8002bb398ef372 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
/*******************************************************************************
 * Copyright (c) 2012, 2016 Mathias Kunter and others.
 *
 * This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/
 *
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors:
 *     Mathias Kunter       - Initial Implementation (Bug 307311)
 *******************************************************************************/
package org.eclipse.cdt.dsf.mi.service.command.output;

import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CharsetEncoder;
import java.nio.charset.StandardCharsets;
import java.text.ParseException;
import java.util.Collections;
import java.util.EnumSet;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.Map;
import java.util.Map.Entry;

/**
 * The MIStringHandler class provides several static functions to handle C and / or MI strings.
 * @since 4.1
 */
public class MIStringHandler {

	/**
	 * A map of special characters which are used within escape notations to represent a
	 * corresponding Unicode code point (i.e. character code).
	 */
	// Use a LinkedHashMap to preserve order, so as to get 'e' and not 'E'
	private static Map<Character, Integer> fSpecialCharactersToCodePointMap = new LinkedHashMap<>();
	static {
		fSpecialCharactersToCodePointMap.put('a', 0x07); // Alert (bell) character
		fSpecialCharactersToCodePointMap.put('b', 0x08); // Backspace character
		fSpecialCharactersToCodePointMap.put('e', 0x1B); // GNU extension: Escape character
		fSpecialCharactersToCodePointMap.put('E', 0x1B); // same as 'e'
		fSpecialCharactersToCodePointMap.put('f', 0x0C); // Form feed character
		fSpecialCharactersToCodePointMap.put('n', 0x0A); // New line character
		fSpecialCharactersToCodePointMap.put('r', 0x0D); // Carriage return character
		fSpecialCharactersToCodePointMap.put('t', 0x09); // Horizontal tabulation character
		fSpecialCharactersToCodePointMap.put('v', 0x0B); // Vertical tabulation character
		fSpecialCharactersToCodePointMap.put('\'', 0x27); // Single quotation mark
		fSpecialCharactersToCodePointMap.put('"', 0x22); // Double quotation mark
		fSpecialCharactersToCodePointMap.put('\\', 0x5C); // Backslash
		fSpecialCharactersToCodePointMap.put('?', 0x3F); // Literal question mark
	}

	/**
	 * An internal helper enumeration which holds the current status while parsing an escaped
	 * text sequence.
	 */
	private enum EscapeStatus {
		NONE, BEGIN, OCTAL_NUMBER, HEX_NUMBER, UNICODE_SHORT_NUMBER, UNICODE_LONG_NUMBER, VALID, INVALID
	}

	/**
	 * An enumeration defining the escape sequences which should be parsed.
	 */
	public enum ParseFlags {
		SPECIAL_CHARS, OCTAL_NUMBERS, HEX_NUMBERS, UNICODE_SHORT_NUMBERS, UNICODE_LONG_NUMBERS
	}

	/**
	 * Translates the given C string into a string suitable for display. This includes handling
	 * of escaped characters and different string encodings. This is necessary in order to correctly
	 * deal with non-ASCII strings.
	 * @param str The C string to translate.
	 * @param escapeChars Defines whether non-printable characters should be escaped within
	 * the translated string, or not.
	 * @return The translated string.
	 */
	public static String translateCString(String str, boolean escapeChars) {
		if (escapeChars) {
			// Don't parse the special character escape notations here. We can do this here because
			// we want to keep them in their escaped form anyway, and because the following string
			// transcoding process isn't affected by escaped special chars. By doing so we avoid
			// caring about some nasty details of the special character escaping process: for
			// example, single quotation marks are commonly only escaped within character constants,
			// while double quotation marks are commonly only escaped within string constants. By
			// not parsing the special character escape notations at all here, we just keep the
			// original special character escaping provided by the given MI string.
			str = parseString(str, EnumSet.complementOf(EnumSet.of(ParseFlags.SPECIAL_CHARS)));
		} else {
			// Parse all escaped characters.
			str = parseString(str);
		}

		// Transcode the string in order to handle non-ASCII strings correctly.
		str = transcodeString(str);

		if (escapeChars) {
			// Escape any non-printable characters again, as we want to be able to display them.
			// However, don't escape any printable special chars, as they haven't been parsed before.
			str = escapeString(str, false);
		} else {
			// No escaping necessary here. We however have to make sure that we use the correct line
			// separation character sequence.
			str = str.replace("\n", System.getProperty("line.separator", "\n")); //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
		}

		return str;
	}

	/**
	 * Returns whether the given character is a special character, or not.
	 * @param c The character to test.
	 * @return The test result.
	 */
	public static boolean isSpecialChar(char c) {
		return fSpecialCharactersToCodePointMap.containsKey(c);
	}

	/**
	 * Returns whether the given Unicode code point is a special code point, or not.
	 * @param codePoint The Unicode code point to test.
	 * @return The test result.
	 */
	public static boolean isSpecialCodePoint(int codePoint) {
		return fSpecialCharactersToCodePointMap.containsValue(codePoint);
	}

	/**
	 * Parses the given special character into an Unicode code point.
	 * @param c The special character to parse.
	 * @return The parsed Unicode code point.
	 * @throws ParseException Thrown when the given character can't be parsed. This happens when it's
	 * not a special character.
	 */
	public static int parseSpecialChar(char c) throws ParseException {
		Integer codePoint = fSpecialCharactersToCodePointMap.get(c);
		if (codePoint != null) {
			return codePoint;
		}
		throw new ParseException("The given character '" + c + "' is not a special character.", 0); //$NON-NLS-1$ //$NON-NLS-2$
	}

	/**
	 * Parses the given special Unicode code point into a character.
	 * @param codePoint The special Unicode code point to parse.
	 * @return The parsed character.
	 * @throws ParseException Thrown when the given Unicode code point can't be parsed. This happens
	 * when it's not a special code point.
	 */
	public static char parseSpecialCodePoint(int codePoint) throws ParseException {
		for (Entry<Character, Integer> entry : fSpecialCharactersToCodePointMap.entrySet()) {
			if (entry.getValue().equals(codePoint)) {
				return entry.getKey();
			}
		}
		throw new ParseException("The given Unicode code point " + codePoint + " is not a special code point.", 0); //$NON-NLS-1$ //$NON-NLS-2$
	}

	/**
	 * This is an overloaded function. See the Javadoc of the other function overload for details.
	 * @param str The string which should be parsed.
	 * @return The parsed string.
	 */
	public static String parseString(String str) {
		return parseString(str, EnumSet.allOf(ParseFlags.class));
	}

	/**
	 * Parses any escaped characters and replaces them with their corresponding Unicode code points.
	 * This function parses all escape notations which are supported by gcc and / or gdb. Those are:</br></br>
	 *
	 * <ul>
	 * <li>Special char escape notations: \a, \b, \e, \E, \f, \n, \r, \t, \v, \', \", \\, and \?</li>
	 *
	 * <li>Octal escape notation: An initial backslash, followed by 1, 2, or 3 octal digits. Values
	 * above 0xFF are ignored. Octal escape notations may not use more than 3 octal digits.</li>
	 *
	 * <li>Hexadecimal escape notation: An initial backslash, followed by an "x" and 1 or more
	 * hexadecimal digits. Hexadecimal escape notations may not use more than 4 hexadecimal digits
	 * (although gcc accepts hexadecimal escape notations of any arbitrary length).</li>
	 *
	 * <li>Short Unicode escape notation: An initial backslash, followed by an "u" and exactly 4
	 * hexadecimal digits.</li>
	 *
	 * <li>Long Unicode escape notation: An initial backslash, followed by an "U" and exactly 8
	 * hexadecimal digits.</li>
	 * </ul>
	 * @param str The string which should be parsed.
	 * @param parseFlags The set of escape notations which should be parsed.
	 * @return The parsed string.
	 */
	public static String parseString(String str, EnumSet<ParseFlags> parseFlags) {
		StringBuilder buffer = new StringBuilder();
		StringBuilder escapeBuffer = new StringBuilder();
		EscapeStatus escStatus = EscapeStatus.NONE;

		for (int i = 0; i < str.length(); i++) {
			char c = str.charAt(i);
			boolean consumeChar = true;
			boolean isLastChar = i == str.length() - 1;

			if (escStatus == EscapeStatus.NONE) {
				if (c == '\\') {
					// Escaping begins. Reset the escape buffer.
					escapeBuffer.setLength(0);
					escapeBuffer.append(c);
					escStatus = EscapeStatus.BEGIN;
				}
			} else if (escStatus == EscapeStatus.BEGIN) {
				if (parseFlags.contains(ParseFlags.SPECIAL_CHARS) && isSpecialChar(c)) {
					try {
						buffer.appendCodePoint(parseSpecialChar(c));
						escStatus = EscapeStatus.VALID;
					} catch (ParseException e) {
						// This is just for completeness. We will actually never catch any ParseException here
						// since we already checked the character with isSpecialChar() before.
						escapeBuffer.append(c);
						escStatus = EscapeStatus.INVALID;
					}
				} else if (parseFlags.contains(ParseFlags.OCTAL_NUMBERS) && c >= '0' && c <= '7') {
					escStatus = EscapeStatus.OCTAL_NUMBER;
					// Don't consume this character right now - as this wouldn't work if it's the last character.
					consumeChar = false;
				} else if (parseFlags.contains(ParseFlags.HEX_NUMBERS) && c == 'x') {
					escStatus = EscapeStatus.HEX_NUMBER;
				} else if (parseFlags.contains(ParseFlags.UNICODE_SHORT_NUMBERS) && c == 'u') {
					escStatus = EscapeStatus.UNICODE_SHORT_NUMBER;
				} else if (parseFlags.contains(ParseFlags.UNICODE_LONG_NUMBERS) && c == 'U') {
					escStatus = EscapeStatus.UNICODE_LONG_NUMBER;
				} else {
					escStatus = EscapeStatus.INVALID;
				}
				if (consumeChar) {
					escapeBuffer.append(c);
				}
			} else if (escStatus == EscapeStatus.HEX_NUMBER) {
				// Only consume this character if it belongs to the escape sequence.
				consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
				if (consumeChar) {
					escapeBuffer.append(c);
				}

				if (!consumeChar || isLastChar || escapeBuffer.length() == 6) {
					// The escape sequence is terminated. Set the escape status to invalid until
					// we know that it's actually valid.
					escStatus = EscapeStatus.INVALID;
					if (escapeBuffer.length() > 2) {
						// Decode the hexadecimal number.
						try {
							int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16);
							if (codePoint <= 0x10FFFF) {
								buffer.appendCodePoint(codePoint);
								escStatus = EscapeStatus.VALID;
							}
						} catch (NumberFormatException e) {
						}
					}
				}
			} else if (escStatus == EscapeStatus.UNICODE_SHORT_NUMBER
					|| escStatus == EscapeStatus.UNICODE_LONG_NUMBER) {
				// Only consume this character if it belongs to the escape sequence.
				consumeChar = (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F');
				if (consumeChar) {
					escapeBuffer.append(c);
				}

				int finalLength = escStatus == EscapeStatus.UNICODE_SHORT_NUMBER ? 6 : 10;
				if (escapeBuffer.length() == finalLength) {
					// The escape sequence is terminated. Set the escape status to invalid until
					// we know that it's actually valid. Decode the hexadecimal number.
					escStatus = EscapeStatus.INVALID;
					try {
						int codePoint = Integer.parseInt(escapeBuffer.toString().substring(2), 16);
						if (codePoint <= 0x10FFFF) {
							buffer.appendCodePoint(codePoint);
							escStatus = EscapeStatus.VALID;
						}
					} catch (NumberFormatException e) {
					}
				} else if (!consumeChar || isLastChar) {
					// The escape sequence is terminated and invalid.
					escStatus = EscapeStatus.INVALID;
				}
			} else if (escStatus == EscapeStatus.OCTAL_NUMBER) {
				// Only consume this character if it belongs to the escape sequence.
				consumeChar = c >= '0' && c <= '7';
				if (consumeChar) {
					escapeBuffer.append(c);
				}

				if (!consumeChar || isLastChar || escapeBuffer.length() == 4) {
					// The escape sequence is terminated. Set the escape status to invalid until
					// we know that it's actually valid.
					escStatus = EscapeStatus.INVALID;
					if (escapeBuffer.length() > 1) {
						// Decode the octal number.
						try {
							int codePoint = Integer.parseInt(escapeBuffer.toString().substring(1), 8);
							if (codePoint <= 0xFF) {
								buffer.appendCodePoint(codePoint);
								escStatus = EscapeStatus.VALID;
							}
						} catch (NumberFormatException e) {
						}
					}
				}
			}

			if (escStatus == EscapeStatus.NONE) {
				// Current character isn't escaped - copy it over to the destination buffer.
				buffer.append(c);
			} else if (escStatus == EscapeStatus.VALID) {
				escStatus = EscapeStatus.NONE;
			} else if (escStatus == EscapeStatus.INVALID) {
				buffer.append(escapeBuffer);
				escStatus = EscapeStatus.NONE;
			}

			if (!consumeChar) {
				// Don't consume the current character.
				i--;
			}
		}

		// Check for non-finished escape sequences at the end of the string.
		if (escStatus != EscapeStatus.NONE) {
			buffer.append(escapeBuffer);
		}

		// Convert the buffer into a string and return it.
		return buffer.toString();
	}

	/**
	 * Transcodes the given string. This is done as follows:</br></br>
	 * 1) The given string is encoded into a binary byte buffer.</br></br>
	 * 2) It's tested whether this binary byte buffer seems to represent a string which is encoded as
	 * either ASCII, Latin-1, or UTF-8. If this is the case, the binary byte buffer is decoded back into
	 * a string and this string is returned. If the test is negative, the given string is returned without
	 * modification because its encoding can't be reliably determined in this case.
	 * The most important use case of this function is to transcode a string which is actually UTF-8 but has
	 * been incorrectly decoded as Latin-1 instead.
	 * @param str The string to transcode.
	 * @return The transcoded string.
	 */
	public static String transcodeString(String str) {
		// Try to transcode the string from Latin-1 to UTF-8 (ASCII doesn't need to be explicitly
		// considered here since Latin-1 is backwards compatible with ASCII). The transcoding will
		// almost certainly only succeed if the string actually *is* encoded in UTF-8. If the
		// transcoding fails, the string is simply left unchanged.
		try {
			// First, try to encode the string as Latin-1 in order to obtain the binary byte
			// representation of the string.
			CharsetEncoder latin1Encoder = Charset.forName("ISO-8859-1").newEncoder(); //$NON-NLS-1$
			ByteBuffer stringBytes = latin1Encoder.encode(CharBuffer.wrap(str.toCharArray()));

			// Next, try to decode the string as UTF-8. This will almost certainly only succeed
			// if the string actually *is* encoded in UTF-8. Note that if the decoding fails,
			// an exception is thrown before the str variable is assigned. The original string
			// is therefore left unchanged in this case.
			CharsetDecoder utf8Decoder = Charset.forName("UTF-8").newDecoder(); //$NON-NLS-1$
			str = utf8Decoder.decode(stringBytes).toString();
		} catch (Exception e) {
		}

		return str;
	}

	/**
	 * Escapes any non-printable characters as well as the printable special characters single quotation
	 * mark, double quotation mark, backslash, and literal question mark within the given string. Supports
	 * the entire Unicode code space.
	 * @param str The string which should be escaped.
	 * @return The escaped string.
	 */
	public static String escapeString(String str) {
		return escapeString(str, true);
	}

	/**
	 * Escapes any non-printable characters within the given string. Supports the entire Unicode code space.
	 * @param str The string which should be escaped.
	 * @param escapePrintableSpecialChars Defines whether the printable special characters single
	 * quotation mark, double quotation mark, backslash, and literal question mark should be
	 * escaped as well, or not.
	 * @return The escaped string.
	 */
	public static String escapeString(String str, boolean escapePrintableSpecialChars) {
		StringBuilder buffer = new StringBuilder();

		for (int i = 0; i < str.length(); i++) {
			// Get the current character code point. Note that using the Java "char" data type isn't
			// sufficient here, as it can't handle all Unicode characters.
			int codePoint = str.codePointAt(i);
			if (Character.isSupplementaryCodePoint(codePoint)) {
				i++;
			}

			// Check the code point type of the character in order to determine whether it's
			// printable or not.
			int codePointType = Character.getType(codePoint);
			switch (codePointType) {
			case Character.LINE_SEPARATOR:
			case Character.PARAGRAPH_SEPARATOR:
			case Character.CONTROL:
			case Character.PRIVATE_USE:
			case Character.SURROGATE:
			case Character.UNASSIGNED:
				// Non-printable character.
				if (isSpecialCodePoint(codePoint)) {
					// Escape by using the special character escape notation.
					buffer.append('\\');
					try {
						buffer.append(parseSpecialCodePoint(codePoint));
					} catch (ParseException e) {
						buffer.appendCodePoint(codePoint);
					}
				} else if (codePoint == 0x00) {
					// Escape the null character separately - don't use leading zeros.
					buffer.append("\\0"); //$NON-NLS-1$
				} else if (codePoint <= 0xFF) {
					// Escape by using the octal escape notation.
					buffer.append(String.format("\\%03o", codePoint)); //$NON-NLS-1$
				} else if (codePoint <= 0xFFFF) {
					// Escape by using the short Unicode escape notation.
					buffer.append(String.format("\\u%04x", codePoint)); //$NON-NLS-1$
				} else {
					// Escape by using the long Unicode escape notation.
					buffer.append(String.format("\\U%08x", codePoint)); //$NON-NLS-1$
				}
				break;
			default:
				// Printable character.
				if (escapePrintableSpecialChars && isSpecialCodePoint(codePoint)) {
					// Escape by using the special character escape notation.
					buffer.append('\\');
					try {
						buffer.append(parseSpecialCodePoint(codePoint));
					} catch (ParseException e) {
						buffer.appendCodePoint(codePoint);
					}
				} else {
					// Don't escape.
					buffer.appendCodePoint(codePoint);
				}
			}
		}

		return buffer.toString();
	}
}

Back to the top