Skip to main content
aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r-- org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java | 165
-rw-r--r-- org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java | 90
2 files changed, 170 insertions, 85 deletions
diff --git a/org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java b/org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java
new file mode 100644
index 00000000..887e5c08
--- /dev/null
+++ b/org.eclipse.vex.core/src/org/eclipse/vex/core/XML.java
@@ -0,0 +1,165 @@
+/*******************************************************************************
+ * Copyright (c) 2013 Carsten Hiesserich and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/legal/epl-v10.html
+ *
+ * Contributors:
+ * Carsten Hiesserich - extracted whitespace handling from DocumentBuilder (bug 408453)
+ *******************************************************************************/
+package org.eclipse.vex.core;
+
+/**
+ * Common processing methods according to http://www.w3.org/TR/REC-xml/
+ *
+ * @see http://www.w3.org/TR/REC-xml/#sec-white-space
+ * @see http://www.w3.org/TR/REC-xml/#sec-line-ends
+ * @see http://www.w3.org/TR/REC-xml/#NT-S
+ */
+public class XML {
+
+    /**
+     * Tests whether a character is XML whitespace: space, tab, carriage return or line feed.
+     * @param c the character to test
+     * @return <code>true</code> if c is a whitespace according to the W3C recommendation (http://www.w3.org/TR/REC-xml/#NT-S)
+     */
+    public static boolean isWhitespace(final char c) {
+        return c == 0x20 || c == 0x9 || c == 0xD || c == 0xA;
+    }
+
+    /**
+     * Replace runs of XML whitespace (see {@link #isWhitespace}) with a single space. Newlines in the input should be
+     * normalized (see {@link #normalizeNewlines(String)}) before calling this method.
+     *
+     * @param input
+     *            String to compress.
+     * @param trimLeading
+     *            <code>true</code> to remove a leading space (a leading newline kept by keepNewlines is not removed)
+     * @param trimTrailing
+     *            <code>true</code> to remove a trailing space (a trailing newline kept by keepNewlines is not removed)
+     * @param keepNewlines
+     *            <code>true</code> to keep newlines (runs of newlines will still be compressed), <code>false</code> to
+     *            replace newlines with a space.
+     * @return A new String with whitespace compressed.
+     */
+    public static String compressWhitespace(final String input, final boolean trimLeading, final boolean trimTrailing, final boolean keepNewlines) {
+        return compressWhitespace(new StringBuilder(input), trimLeading, trimTrailing, keepNewlines).toString();
+    }
+
+    /**
+     * Replace runs of XML whitespace (see {@link #isWhitespace}) with a single space. Newlines in the input should be
+     * normalized (see {@link #normalizeNewlines(StringBuilder)}) before calling this method.
+     *
+     * @param sb
+     *            StringBuilder to compress; it is only read, never modified.
+     * @param trimLeading
+     *            <code>true</code> to remove a leading space (a leading newline kept by keepNewlines is not removed)
+     * @param trimTrailing
+     *            <code>true</code> to remove a trailing space (a trailing newline kept by keepNewlines is not removed)
+     * @param keepNewlines
+     *            <code>true</code> to keep newlines (runs of newlines will still be compressed), <code>false</code> to
+     *            replace newlines with a space.
+     * @return A new StringBuilder with whitespace compressed.
+     */
+    public static StringBuilder compressWhitespace(final StringBuilder sb, final boolean trimLeading, final boolean trimTrailing, final boolean keepNewlines) {
+
+        final StringBuilder result = new StringBuilder(sb.length());
+
+        boolean ws = false; // true if we're in a run of whitespace
+        char last = 0; // previous input character (0 before the first one)
+        for (int i = 0; i < sb.length(); i++) {
+            final char c = sb.charAt(i);
+            if (XML.isWhitespace(c)) {
+                if (c != last && last == '\n' && keepNewlines) { // other whitespace follows a newline: emit the kept newline now
+                    result.append('\n');
+                }
+                ws = true;
+            } else {
+                if (ws) { // the run just ended: emit its single replacement character
+                    result.append(last == '\n' && keepNewlines ? '\n' : ' ');
+                    ws = false;
+                }
+                result.append(c);
+            }
+            last = c;
+        }
+        if (ws) { // input ended inside a whitespace run
+            result.append(last == '\n' && keepNewlines ? '\n' : ' ');
+        }
+        // trim leading and trailing space, if necessary (only ' ' is removed, never a kept newline)
+        if (trimLeading && result.length() > 0 && result.charAt(0) == ' ') {
+            result.deleteCharAt(0);
+        }
+        if (trimTrailing && result.length() > 0 && result.charAt(result.length() - 1) == ' ') {
+            result.setLength(result.length() - 1);
+        }
+
+        return result;
+    }
+
+    /**
+     * Convert CR and CRLF line endings to plain newlines (LF), including a lone CR at the very end of the input.
+     *
+     * @param input String to be normalized.
+     * @return A new String in which every CR and CRLF has been replaced by a single newline.
+     */
+    public static String normalizeNewlines(final String input) {
+        final StringBuilder sb = new StringBuilder(input);
+        normalizeNewlines(sb);
+        return sb.toString();
+    }
+
+    /**
+     * Convert CR and CRLF line endings to plain newlines (LF), including a lone CR at the very end of the input.
+     *
+     * @param sb
+     *            StringBuilder to be normalized; it is modified in place.
+     */
+    public static void normalizeNewlines(final StringBuilder sb) {
+
+        // State machine states
+        final int START = 0;
+        final int SEEN_CR = 1; // the character at i - 1 is a CR that has not been resolved yet
+
+        int state = START;
+        int i = 0;
+        while (i < sb.length()) {
+            // No simple 'for' here, since we may delete chars
+
+            final char c = sb.charAt(i);
+
+            switch (state) {
+            case START:
+                if (c == '\r') {
+                    state = SEEN_CR;
+                }
+                i++;
+                break;
+
+            case SEEN_CR:
+                if (c == '\n') {
+                    // CR-LF, just delete the previous CR
+                    sb.deleteCharAt(i - 1);
+                    state = START;
+                    // no need to advance i, since it's done implicitly
+                } else if (c == '\r') {
+                    // CR line ending followed by another
+                    // Replace the first with a newline...
+                    sb.setCharAt(i - 1, '\n');
+                    i++;
+                    // ...and stay in the SEEN_CR state
+                } else {
+                    // CR line ending, replace it with a newline
+                    sb.setCharAt(i - 1, '\n');
+                    i++;
+                    state = START;
+                }
+            }
+        }
+
+        if (state == SEEN_CR) {
+            sb.setCharAt(sb.length() - 1, '\n'); // input ended on a lone CR: replace it with a newline
+        }
+    }
+}
diff --git a/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java b/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java
index de7b2741..cd7ed24b 100644
--- a/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java
+++ b/org.eclipse.vex.core/src/org/eclipse/vex/core/internal/io/DocumentBuilder.java
@@ -19,6 +19,7 @@ import java.util.List;
import org.eclipse.core.runtime.Assert;
import org.eclipse.core.runtime.QualifiedName;
+import org.eclipse.vex.core.XML;
import org.eclipse.vex.core.internal.dom.Comment;
import org.eclipse.vex.core.internal.dom.Document;
import org.eclipse.vex.core.internal.dom.Element;
@@ -379,44 +380,16 @@ public class DocumentBuilder implements ContentHandler, LexicalHandler {
if (entry != null && entry.pre) {
sb = pendingChars;
+ XML.normalizeNewlines(sb);
} else {
-
- // collapse the space in the pending characters
- sb = new StringBuilder(pendingChars.length());
- boolean ws = false; // true if we're in a run of whitespace
- for (int i = 0; i < pendingChars.length(); i++) {
- final char c = pendingChars.charAt(i);
- if (isXmlWhitespace(c)) {
- ws = true;
- } else {
- if (ws) {
- sb.append(' ');
- ws = false;
- }
- sb.append(c);
- }
- }
- if (ws) {
- sb.append(' ');
- }
- // trim leading and trailing space, if necessary
- if (trimLeading && sb.length() > 0 && sb.charAt(0) == ' ') {
- sb.deleteCharAt(0);
- }
- if (trimTrailing && sb.length() > 0 && sb.charAt(sb.length() - 1) == ' ') {
- sb.setLength(sb.length() - 1);
- }
+ sb = new StringBuilder(pendingChars);
+ XML.normalizeNewlines(sb);
+ sb = XML.compressWhitespace(sb, trimLeading, trimTrailing, false);
}
- normalizeNewlines(sb);
return sb;
}
- private static boolean isXmlWhitespace(final char c) {
- // whitespace according to the W3C recommendation (http://www.w3.org/TR/REC-xml/#NT-S)
- return c == 0x20 || c == 0x9 || c == 0xD || c == 0xA;
- }
-
private boolean isBlock(final Node node) {
return policy != null && policy.isBlock(node);
}
@@ -450,59 +423,6 @@ public class DocumentBuilder implements ContentHandler, LexicalHandler {
});
}
- /**
- * Convert lines that end in CR and CRLFs to plain newlines.
- *
- * @param sb
- * StringBuffer to be normalized.
- */
- private void normalizeNewlines(final StringBuilder sb) {
-
- // State machine states
- final int START = 0;
- final int SEEN_CR = 1;
-
- int state = START;
- int i = 0;
- while (i < sb.length()) {
- // No simple 'for' here, since we may delete chars
-
- final char c = sb.charAt(i);
-
- switch (state) {
- case START:
- if (c == '\r') {
- state = SEEN_CR;
- }
- i++;
- break;
-
- case SEEN_CR:
- if (c == '\n') {
- // CR-LF, just delete the previous CR
- sb.deleteCharAt(i - 1);
- state = START;
- // no need to advance i, since it's done implicitly
- } else if (c == '\r') {
- // CR line ending followed by another
- // Replace the first with a newline...
- sb.setCharAt(i - 1, '\n');
- i++;
- // ...and stay in the SEEN_CR state
- } else {
- // CR line ending, replace it with a newline
- sb.setCharAt(i - 1, '\n');
- i++;
- state = START;
- }
- }
- }
-
- if (state == SEEN_CR) {
- // CR line ending, replace it with a newline
- }
- }
-
private static class StackEntry {
public Element element;
public int offset;

Back to the top