extract whitespace processing to new XML class

https://bugs.eclipse.org/bugs/show_bug.cgi?id=408453
diff --git a/org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java b/org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java
new file mode 100644
index 0000000..887e5c0
--- /dev/null
+++ b/org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java
@@ -0,0 +1,165 @@
+/*******************************************************************************

+ * Copyright (c) 2013 Carsten Hiesserich and others.

+ * All rights reserved. This program and the accompanying materials

+ * are made available under the terms of the Eclipse Public License v1.0

+ * which accompanies this distribution, and is available at

+ * http://www.eclipse.org/legal/epl-v10.html

+ * 

+ * Contributors:

+ *     Carsten Hiesserich - extracted whitespace handling from DocumentBuilder (bug 408453)

+ *******************************************************************************/

+package org.eclipse.vex.core;

+

+/**

+ * Common processing methods according to http://www.w3.org/TR/REC-xml/

+ * 

+ * @see http://www.w3.org/TR/REC-xml/#sec-white-space

+ * @see http://www.w3.org/TR/REC-xml/#sec-line-ends

+ * @see http://www.w3.org/TR/REC-xml/#NT-S

+ */

+public class XML {

+

+	/**

+	 * @param c

+	 * @return <code>true</code> if c is a whitespace according to the W3C recommendation<br />

+	 *         (http://www.w3.org/TR/REC-xml/#NT-S)

+	 */

+	public static boolean isWhitespace(final char c) {

+		return c == 0x20 || c == 0x9 || c == 0xD || c == 0xA;

+	}

+

+	/**

+	 * Replace runs of XML whitespace (see {@link #isWhitespace}) with a single space. Newlines in the input should be

+	 * normalized before calling this method.

+	 * 

+	 * @param input

+	 *            String to compress.

+	 * @param trimLeading

+	 *            <code>true</code> to remove leading whitespace

+	 * @param trimTrailing

+	 *            <code>true</code> to remove trailing whitespace

+	 * @param keepNewlines

+	 *            <code>true</code> to keep newlines (runs of newlines will still be compressed), <code>false</code> to

+	 *            replace newlines with a space.

+	 * @return A new String with whitespace compressed.

+	 */

+	public static String compressWhitespace(final String input, final boolean trimLeading, final boolean trimTrailing, final boolean keepNewlines) {

+		return compressWhitespace(new StringBuilder(input), trimLeading, trimTrailing, keepNewlines).toString();

+	}

+

+	/**

+	 * Replace runs of XML whitespace (see {@link #isWhitespace}) with a single space. Newlines in the input should be

+	 * normalized before calling this method.

+	 * 

+	 * @param sb

+	 *            StringBuilder to compress.

+	 * @param trimLeading

+	 *            <code>true</code> to remove leading whitespace

+	 * @param trimTrailing

+	 *            <code>true</code> to remove trailing whitespace

+	 * @param keepNewlines

+	 *            <code>true</code> to keep newlines (runs of newlines will still be compressed), <code>false</code> to

+	 *            replace newlines with a space.

+	 * @return A new StringBuilder with whitespace compressed.

+	 */

+	public static StringBuilder compressWhitespace(final StringBuilder sb, final boolean trimLeading, final boolean trimTrailing, final boolean keepNewlines) {

+

+		final StringBuilder result = new StringBuilder(sb.length());

+

+		boolean ws = false; // true if we're in a run of whitespace

+		char last = 0;

+		for (int i = 0; i < sb.length(); i++) {

+			final char c = sb.charAt(i);

+			if (XML.isWhitespace(c)) {

+				if (c != last && last == '\n' && keepNewlines) {

+					result.append('\n');

+				}

+				ws = true;

+			} else {

+				if (ws) {

+					result.append(last == '\n' && keepNewlines ? '\n' : ' ');

+					ws = false;

+				}

+				result.append(c);

+			}

+			last = c;

+		}

+		if (ws) {

+			result.append(last == '\n' && keepNewlines ? '\n' : ' ');

+		}

+		// trim leading and trailing space, if necessary

+		if (trimLeading && result.length() > 0 && result.charAt(0) == ' ') {

+			result.deleteCharAt(0);

+		}

+		if (trimTrailing && result.length() > 0 && result.charAt(result.length() - 1) == ' ') {

+			result.setLength(result.length() - 1);

+		}

+

+		return result;

+	}

+

+	/**

+	 * Convert lines that end in CR and CRLFs to plain newlines.

+	 * 

+	 * @param input

+	 *            String to be normalized.

+	 */

+	public static String normalizeNewlines(final String input) {

+		final StringBuilder sb = new StringBuilder(input);

+		normalizeNewlines(sb);

+		return sb.toString();

+	}

+

+	/**

+	 * Convert lines that end in CR and CRLFs to plain newlines.

+	 * 

+	 * @param sb

+	 *            StringBuilder to be normalized.

+	 */

+	public static void normalizeNewlines(final StringBuilder sb) {

+

+		// State machine states

+		final int START = 0;

+		final int SEEN_CR = 1;

+

+		int state = START;

+		int i = 0;

+		while (i < sb.length()) {

+			// No simple 'for' here, since we may delete chars

+

+			final char c = sb.charAt(i);

+

+			switch (state) {

+			case START:

+				if (c == '\r') {

+					state = SEEN_CR;

+				}

+				i++;

+				break;

+

+			case SEEN_CR:

+				if (c == '\n') {

+					// CR-LF, just delete the previous CR

+					sb.deleteCharAt(i - 1);

+					state = START;

+					// no need to advance i, since it's done implicitly

+				} else if (c == '\r') {

+					// CR line ending followed by another

+					// Replace the first with a newline...

+					sb.setCharAt(i - 1, '\n');

+					i++;

+					// ...and stay in the SEEN_CR state

+				} else {

+					// CR line ending, replace it with a newline

+					sb.setCharAt(i - 1, '\n');

+					i++;

+					state = START;

+				}

+			}

+		}

+

+		if (state == SEEN_CR) {

+			// CR line ending, replace it with a newline

+		}

+	}

+}

diff --git a/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java b/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java
index de7b274..cd7ed24 100644
--- a/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java
+++ b/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java
@@ -19,6 +19,7 @@
 
 import org.eclipse.core.runtime.Assert;
 import org.eclipse.core.runtime.QualifiedName;
+import org.eclipse.vex.core.XML;
 import org.eclipse.vex.core.internal.dom.Comment;
 import org.eclipse.vex.core.internal.dom.Document;
 import org.eclipse.vex.core.internal.dom.Element;
@@ -379,44 +380,16 @@
 
 		if (entry != null && entry.pre) {
 			sb = pendingChars;
+			XML.normalizeNewlines(sb);
 		} else {
-
-			// collapse the space in the pending characters
-			sb = new StringBuilder(pendingChars.length());
-			boolean ws = false; // true if we're in a run of whitespace
-			for (int i = 0; i < pendingChars.length(); i++) {
-				final char c = pendingChars.charAt(i);
-				if (isXmlWhitespace(c)) {
-					ws = true;
-				} else {
-					if (ws) {
-						sb.append(' ');
-						ws = false;
-					}
-					sb.append(c);
-				}
-			}
-			if (ws) {
-				sb.append(' ');
-			}
-			// trim leading and trailing space, if necessary
-			if (trimLeading && sb.length() > 0 && sb.charAt(0) == ' ') {
-				sb.deleteCharAt(0);
-			}
-			if (trimTrailing && sb.length() > 0 && sb.charAt(sb.length() - 1) == ' ') {
-				sb.setLength(sb.length() - 1);
-			}
+			sb = new StringBuilder(pendingChars);
+			XML.normalizeNewlines(sb);
+			sb = XML.compressWhitespace(sb, trimLeading, trimTrailing, false);
 		}
 
-		normalizeNewlines(sb);
 		return sb;
 	}
 
-	private static boolean isXmlWhitespace(final char c) {
-		// whitespace according to the W3C recommendation (http://www.w3.org/TR/REC-xml/#NT-S)
-		return c == 0x20 || c == 0x9 || c == 0xD || c == 0xA;
-	}
-
 	private boolean isBlock(final Node node) {
 		return policy != null && policy.isBlock(node);
 	}
@@ -450,59 +423,6 @@
 		});
 	}
 
-	/**
-	 * Convert lines that end in CR and CRLFs to plain newlines.
-	 * 
-	 * @param sb
-	 *            StringBuffer to be normalized.
-	 */
-	private void normalizeNewlines(final StringBuilder sb) {
-
-		// State machine states
-		final int START = 0;
-		final int SEEN_CR = 1;
-
-		int state = START;
-		int i = 0;
-		while (i < sb.length()) {
-			// No simple 'for' here, since we may delete chars
-
-			final char c = sb.charAt(i);
-
-			switch (state) {
-			case START:
-				if (c == '\r') {
-					state = SEEN_CR;
-				}
-				i++;
-				break;
-
-			case SEEN_CR:
-				if (c == '\n') {
-					// CR-LF, just delete the previous CR
-					sb.deleteCharAt(i - 1);
-					state = START;
-					// no need to advance i, since it's done implicitly
-				} else if (c == '\r') {
-					// CR line ending followed by another
-					// Replace the first with a newline...
-					sb.setCharAt(i - 1, '\n');
-					i++;
-					// ...and stay in the SEEN_CR state
-				} else {
-					// CR line ending, replace it with a newline
-					sb.setCharAt(i - 1, '\n');
-					i++;
-					state = START;
-				}
-			}
-		}
-
-		if (state == SEEN_CR) {
-			// CR line ending, replace it with a newline
-		}
-	}
-
 	private static class StackEntry {
 		public Element element;
 		public int offset;