Home

Contribute

Source code


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193

/*******************************************************************************
 * Copyright (c) 2011 Wind River Systems, Inc. and others. All rights reserved.
 * This program and the accompanying materials are made available under the terms
 * of the Eclipse Public License v1.0 which accompanies this distribution, and is
 * available at http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 * Wind River Systems - initial API and implementation
 *******************************************************************************/
package org.eclipse.tcf.te.core.utils.text;

import java.io.IOException;
import java.io.StreamTokenizer;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;

import org.eclipse.core.runtime.Assert;

/**
 * A class providing useful static methods to manipulate strings.
 */
public final class StringUtil {

	/** Matches strings consisting solely of characters that are returned unmodified by {@link #enQuote(String)}. */
	private final static Pattern ALLOWED_STRING_PATTERN = Pattern.compile("[a-zA-Z0-9_@.-]*"); //$NON-NLS-1$

	/** Matches a single whitespace character. Compiled once instead of on every {@link #enQuote(String)} call. */
	private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s"); //$NON-NLS-1$

	/**
	 * Tokenize a list of whitespace-separated arguments into an array.
	 * <p>
	 * <b>Note:</b> <i>This method has been designed for the specific needs of tokenizing
	 * command line arguments to launch external application from Java!</i>
	 * <p>
	 * Arguments may be quoted by double quotes. Quotes must not appear
	 * inside words, or they will lead to new tokens.
	 * <p>
	 * Example:<pre><code>    a"bc"d   --&gt;  tokenized into a,bc,d</code></pre>
	 * <p>
	 * If <code>maxArgs</code> is greater than 0, at most <code>maxArgs</code> tokens are
	 * returned and the remainder of the input is ignored. The returned array is never
	 * padded: its length is the number of tokens actually extracted.
	 * <p>
	 * If <code>maxArgs</code> is less than or equal 0, all arguments are tokenized.
	 * <p>
	 * If <code>keepQuotes</code> is <code>true</code>, the content of each quoted argument is
	 * passed through {@link #enQuote(String)} and the result is guaranteed to start and end
	 * with a double quote. Unquoted words are always returned as they were read.
	 *
	 * @param arguments The space separated arguments string. Must not be <code>null</code>
	 *                  (checked via <code>Assert.isNotNull</code>).
	 * @param maxArgs The maximum number of returned tokens or <code>0</code> for no limit.
	 * @param keepQuotes If <code>true</code>, the original arguments quotes are retained, <code>false</code> otherwise.
	 *
	 * @return The tokens in input order, or an empty array if there are none.
	 */
	public static String[] tokenize(String arguments, int maxArgs, boolean keepQuotes) {
		Assert.isNotNull(arguments);

		// Create the result list
		List<String> result = maxArgs > 0 ? new ArrayList<String>(maxArgs) : new ArrayList<String>();

		StreamTokenizer tok = new StreamTokenizer(new StringReader(arguments));
		tok.resetSyntax();
		// whitespace is everything from 0 to 32 (space)
		tok.whitespaceChars(0, 32);
		// everything from 33 to 255 is treated as word character
		tok.wordChars(33, 255);
		// except 0xa0, which is whitespace too
		tok.whitespaceChars(0xa0, 0xa0);
		// the quoting character is the double-quote
		tok.quoteChar('"');

		try {
			// Every iteration adds exactly one token, so the list size is the argument count.
			// Extract only the number of arguments requested, or all of them if maxArgs <= 0.
			while (maxArgs <= 0 || result.size() < maxArgs) {
				int ttype = tok.nextToken();
				if (ttype == StreamTokenizer.TT_EOF) {
					break;
				}

				// For a quoted string the token type is the quote character itself
				if (keepQuotes && ttype == '"') {
					String quoted = enQuote(tok.sval);
					// enQuote only adds surrounding quotes when the value contains
					// whitespace; add them here if they are still missing.
					if (quoted.length() < 2 || quoted.charAt(0) != '"' ||
						quoted.charAt(quoted.length() - 1) != '"') {
						quoted = '"' + quoted + '"';
					}
					result.add(quoted);
				}
				else {
					result.add(tok.sval);
				}
			}
		}
		catch (IOException e) {
			// Best effort: on any IO exception stop tokenizing and
			// return the tokens collected so far.
		}
		return result.toArray(new String[result.size()]);
	}

	/**
	 * Escape the given string and, if it contains whitespace, put double quotes around it.
	 * <p>
	 * <b>Note:</b> <i>This method has been designed for the specific needs of enquoting
	 * command line arguments to launch external application from Java!</i>
	 * <p>
	 * The result is determined as follows:
	 * <ul>
	 *   <li><code>null</code> and the empty string yield two double quotes.</li>
	 *   <li>A string consisting only of ASCII letters, digits, the underscore and
	 *       <code>@.-</code> is returned unchanged.</li>
	 *   <li>For any other string, single and double quotes are prefixed with a backslash;
	 *       backspace, form feed, line feed, carriage return and tab are written as
	 *       {@code \b}, {@code \f}, {@code \n}, {@code \r} and {@code \t}; any other
	 *       character below 0x20 or in the range 0x7f to 0xff is written as a backslash
	 *       followed by exactly three octal digits; any character above 0xff is written as
	 *       a backslash, the letter <code>u</code> and four hexadecimal digits. All other
	 *       characters, including the backslash itself, are copied as they are.</li>
	 *   <li>The escaped text is surrounded by double quotes only if the original string
	 *       contains at least one whitespace character.</li>
	 * </ul>
	 *
	 * @param unqouted The string to quote or <code>null</code>.
	 *
	 * @return The escaped and possibly quoted string, or "\"\"" for <code>null</code> or empty input.
	 */
	public static String enQuote(String unqouted) {
		if (unqouted == null || unqouted.length() == 0) {
			return "\"\""; //$NON-NLS-1$
		}
		if (ALLOWED_STRING_PATTERN.matcher(unqouted).matches()) {
			return unqouted;
		}

		boolean containsWhitespaces = WHITESPACE_PATTERN.matcher(unqouted).find();
		StringBuilder buf = new StringBuilder(unqouted.length() + 16);
		if (containsWhitespaces) {
			buf.append('"');
		}

		for (int i = 0; i < unqouted.length(); i++) {
			char c = unqouted.charAt(i);
			switch (c) {
				// NOTE(review): the backslash is not escaped here; presumably this is
				// deliberate (e.g. to keep path separators intact) - confirm before changing.
				case '\'':
				case '"':
					buf.append('\\');
					buf.append(c);
					break;
				case '\b':
					buf.append("\\b"); break; //$NON-NLS-1$
				case '\f':
					buf.append("\\f"); break; //$NON-NLS-1$
				case '\n':
					buf.append("\\n"); break; //$NON-NLS-1$
				case '\r':
					buf.append("\\r"); break; //$NON-NLS-1$
				case '\t':
					buf.append("\\t"); break; //$NON-NLS-1$
				default:
					if (c > 0xff) {
						// Unicode escape, left-padded to four hex digits
						String hexString = Integer.toHexString(c);
						buf.append('\\');
						buf.append('u');
						for (int pad = hexString.length(); pad < 4; pad++) {
							buf.append('0');
						}
						buf.append(hexString);
					}
					else if (c < 0x20 || c > '~') {
						// Octal escape, left-padded to three digits. Without the padding
						// a following digit character would be absorbed into the escape
						// when the text is parsed again (0x01 followed by '2' would read
						// as octal 12, i.e. a line feed).
						String octalString = Integer.toOctalString(c);
						buf.append('\\');
						for (int pad = octalString.length(); pad < 3; pad++) {
							buf.append('0');
						}
						buf.append(octalString);
					}
					else {
						buf.append(c);
					}
			}
		}

		if (containsWhitespaces) {
			buf.append('"');
		}
		return buf.toString();
	}

}