summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSteffen Pingel2011-12-27 07:46:11 (EST)
committerSteffen Pingel2011-12-27 07:46:11 (EST)
commitf35d3f54054b639f04c1ef8734dc499646375c8d (patch)
tree00659c61032349617210405dbdbc00f4e280e3a5
parent985b7edd53996894d750e1d31a92d64a3a34d711 (diff)
downloadorg.eclipse.mylyn.commons-f35d3f54054b639f04c1ef8734dc499646375c8d.zip
org.eclipse.mylyn.commons-f35d3f54054b639f04c1ef8734dc499646375c8d.tar.gz
org.eclipse.mylyn.commons-f35d3f54054b639f04c1ef8734dc499646375c8d.tar.bz2
NEW - bug 367573: [api] provide API for parsing HTML streams in
o.e.m.commons.core https://bugs.eclipse.org/bugs/show_bug.cgi?id=367573 Change-Id: I8ebcc689787daea66f3a5c98389d510a3aa2ffae
-rw-r--r--org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlStreamTokenizer.java1141
-rw-r--r--org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlTag.java374
-rw-r--r--org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlUtil.java20
-rw-r--r--org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java2
-rw-r--r--org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlTag.java2
-rw-r--r--org.eclipse.mylyn.commons.repositories.http.core/src/org/eclipse/mylyn/commons/repositories/http/core/CommonHttpClient.java5
6 files changed, 1544 insertions, 0 deletions
diff --git a/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlStreamTokenizer.java b/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlStreamTokenizer.java
new file mode 100644
index 0000000..ccbf789
--- /dev/null
+++ b/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlStreamTokenizer.java
@@ -0,0 +1,1141 @@
+/*******************************************************************************
+ * Copyright (c) 2004, 2008 Tasktop Technologies and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/legal/epl-v10.html
+ *
+ * Contributors:
+ * Tasktop Technologies - initial API and implementation
+ *******************************************************************************/
+
+package org.eclipse.mylyn.commons.core;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.Reader;
+import java.net.URL;
+import java.text.ParseException;
+import java.util.HashMap;
+import java.util.Locale;
+
+/**
+ * Parses HTML into tokens.
+ *
+ * @author Shawn Minto
+ * @since 3.7
+ */
+public class HtmlStreamTokenizer {
+
+ /** parser state; set to <code>State.TEXT</code> by the constructor and advanced by <code>nextToken</code> */
+ private State state;
+
+ /** reader from which to parse the text; the constructor wraps the supplied reader in a BufferedReader */
+ private final BufferedReader in;
+
+ /** base URL for resolving relative URLs; handed to every HtmlTag created by <code>nextToken</code> */
+ private final URL base;
+
+ /** buffer holding the text of the current token; emptied at the start of each <code>nextToken</code> scan */
+ private final StringBuffer textBuffer;
+
+ /** buffer holding whitespace preceding the current token; emptied at the start of each <code>nextToken</code> scan */
+ private final StringBuffer whitespaceBuffer;
+
+ /**
+ * holds a token that was read and then put back in the queue to be returned again on <code>nextToken</code> call;
+ * <code>null</code> when no token is pending
+ */
+ private Token pushbackToken;
+
+ /**
+ * holds a character that was read and then determined not to be part of the current token; the value
+ * <code>0</code> means that no character is pending
+ */
+ private int pushbackChar;
+
+ /** current quote delimiter (single or double); only meaningful while the parser is inside a quoted tag value */
+ private int quoteChar;
+
+ /**
+ * Allow class client to choose if tag attributes are escaped or not; passed to <code>parseTag</code> as its
+ * <code>escapeValues</code> argument and initialised to <code>true</code> by the constructor
+ */
+ private boolean escapeTagValues;
+
+ /**
+ * Creates a tokenizer that starts in the text state with empty buffers, no pending pushback and attribute
+ * unescaping enabled.
+ *
+ * @param in
+ * reader supplying the HTML document; it is wrapped in a {@link BufferedReader}
+ * @param base
+ * URL used for resolving relative URLs found in tags
+ */
+ public HtmlStreamTokenizer(Reader in, URL base) {
+     this.in = new BufferedReader(in);
+     this.base = base;
+     this.textBuffer = new StringBuffer();
+     this.whitespaceBuffer = new StringBuffer();
+     this.state = State.TEXT;
+     this.pushbackChar = 0;
+     this.escapeTagValues = true;
+ }
+
+ /**
+ * Sets the flag that is handed to the tag parser as its <code>escapeValues</code> argument for every tag read
+ * afterwards.
+ *
+ * @param value
+ * the new flag value; the constructor default is <code>true</code>
+ */
+ public void escapeTagAttributes(boolean value) {
+     this.escapeTagValues = value;
+ }
+
+ /**
+ * Returns the next token from the stream.
+ * <p>
+ * A token previously handed to <code>pushback</code> is returned first. Otherwise characters are consumed until a
+ * complete text run, tag or comment has been read. When the reader is exhausted, text collected while in the text
+ * state is returned as a final text token; in every other case an <code>EOF</code> token is returned.
+ * <p>
+ * The buffers carried by a returned token are private copies of the tokenizer's working buffers, so a token keeps
+ * its content after further calls to this method.
+ *
+ * @return the next token, never <code>null</code>
+ * @throws IOException
+ * if reading from the underlying reader fails
+ * @throws ParseException
+ * if the content of a tag cannot be parsed, for example when the tag is empty
+ */
+ public Token nextToken() throws IOException, ParseException {
+     if (pushbackToken != null) {
+         Token token = pushbackToken;
+         pushbackToken = null;
+         return token;
+     }
+
+     // number of consecutive '-' characters seen at the tail of a comment
+     int closingComment = 0;
+
+     textBuffer.setLength(0);
+     whitespaceBuffer.setLength(0);
+     do {
+         int ch;
+         if (pushbackChar != 0) {
+             ch = pushbackChar;
+             pushbackChar = 0;
+         } else {
+             ch = in.read();
+         }
+         if (ch < 0) {
+             // end of input: only text gathered in the TEXT state is still delivered
+             State oldState = state;
+             state = State.EOF;
+             if (textBuffer.length() > 0 && oldState == State.TEXT) {
+                 return new Token(snapshot(textBuffer), snapshot(whitespaceBuffer), false);
+             } else {
+                 return new Token();
+             }
+         }
+         if (state == State.TEXT) {
+             if (ch == '<') {
+                 state = State.TAG;
+                 if (textBuffer.length() > 0) {
+                     return new Token(snapshot(textBuffer), snapshot(whitespaceBuffer), false);
+                 }
+             } else if (Character.isWhitespace((char) ch)) {
+                 // whitespace ends the current word; re-read it in the WS state
+                 pushbackChar = ch;
+                 state = State.WS;
+                 if (textBuffer.length() > 0) {
+                     return new Token(snapshot(textBuffer), snapshot(whitespaceBuffer), false);
+                 }
+             } else {
+                 textBuffer.append((char) ch);
+             }
+         } else if (state == State.WS) {
+             if (!Character.isWhitespace((char) ch)) {
+                 pushbackChar = ch;
+                 state = State.TEXT;
+             } else {
+                 whitespaceBuffer.append((char) ch);
+             }
+         } else if (state == State.TAG) {
+             if (ch == '>') {
+                 state = State.TEXT;
+                 HtmlTag tag = new HtmlTag(base);
+                 parseTag(textBuffer.toString(), tag, escapeTagValues);
+                 return new Token(tag, snapshot(whitespaceBuffer));
+             }
+             if (ch == '<' && textBuffer.length() == 0) {
+                 // "<<" is not a tag start; keep both characters as literal text
+                 textBuffer.append("<<"); //$NON-NLS-1$
+                 state = State.TEXT;
+             } else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-'
+                     && textBuffer.charAt(0) == '!') {
+                 // "<!--" seen: discard the marker and collect the comment body
+                 textBuffer.setLength(0);
+                 state = State.COMMENT;
+             } else if (ch == '\'' || ch == '"') {
+                 quoteChar = ch;
+                 textBuffer.append((char) ch);
+                 state = State.TAG_QUOTE;
+             } else {
+                 textBuffer.append((char) ch);
+             }
+         } else if (state == State.TAG_QUOTE) {
+             if (ch == '>') {
+                 // an unterminated quote does not swallow the end of the tag
+                 pushbackChar = ch;
+                 state = State.TAG;
+             } else {
+                 textBuffer.append((char) ch);
+                 if (ch == quoteChar) {
+                     state = State.TAG;
+                 }
+             }
+         } else if (state == State.COMMENT) {
+             if (ch == '>' && closingComment >= 2) {
+                 // drop the trailing "--" that belongs to the comment terminator
+                 textBuffer.setLength(textBuffer.length() - 2);
+                 closingComment = 0;
+                 state = State.TEXT;
+                 return new Token(snapshot(textBuffer), snapshot(whitespaceBuffer), true);
+             }
+             if (ch == '-') {
+                 closingComment++;
+             } else {
+                 closingComment = 0;
+             }
+             textBuffer.append((char) ch);
+         }
+     } while (true);
+ }
+
+ /**
+ * Copies a working buffer so that a returned token does not share state with the tokenizer.
+ */
+ private static StringBuffer snapshot(StringBuffer buffer) {
+     return new StringBuffer(buffer.toString());
+ }
+
+ /**
+ * Stores a token so that the following <code>nextToken</code> call returns it instead of reading from the stream.
+ * Only one token is remembered; a later call replaces the stored one.
+ *
+ * @param token
+ * the token to hand out again
+ */
+ public void pushback(Token token) {
+     this.pushbackToken = token;
+ }
+
+ /**
+ * Splits the text found between <code>&lt;</code> and <code>&gt;</code> into a tag name and its attributes and
+ * stores both in the given tag.
+ *
+ * @param s
+ * raw tag content without the angle brackets
+ * @param tag
+ * tag object that receives the name and the attributes
+ * @param escapeValues
+ * forwarded unchanged to the attribute parser
+ * @throws ParseException
+ * if <code>s</code> is empty or consists of whitespace only
+ */
+ private static void parseTag(String s, HtmlTag tag, boolean escapeValues) throws ParseException {
+     final int length = s.length();
+
+     // leading whitespace
+     int pos = 0;
+     while (pos < length && Character.isWhitespace(s.charAt(pos))) {
+         pos++;
+     }
+     if (pos == length) {
+         throw new ParseException("parse empty tag", 0); //$NON-NLS-1$
+     }
+
+     // the name is everything up to the next whitespace character
+     final int nameStart = pos;
+     while (pos < length && !Character.isWhitespace(s.charAt(pos))) {
+         pos++;
+     }
+     tag.setTagName(s.substring(nameStart, pos));
+
+     // whatever follows the separating whitespace is the attribute list
+     while (pos < length && Character.isWhitespace(s.charAt(pos))) {
+         pos++;
+     }
+     if (pos < length) {
+         parseAttributes(tag, s, pos, escapeValues);
+     }
+ }
+
+ /**
+ * Parses HTML tag attributes from a buffer and sets them in an HtmlTag.
+ * <p>
+ * Attribute names are lower-cased. A bare <code>/</code> marks the tag as self terminating. An attribute without
+ * <code>=</code> is stored with an empty value. Quoted values (single or double quotes) are HTML-unescaped only
+ * when <code>escapeValues</code> is <code>true</code>; unquoted values run up to the next whitespace and are stored
+ * as they are. Parsing stops silently at an unterminated quoted value or at a trailing <code>=</code>.
+ *
+ * @param tag
+ * tag that receives the attributes
+ * @param s
+ * raw tag content
+ * @param i
+ * index in <code>s</code> at which the attribute list starts
+ * @param escapeValues
+ * whether quoted attribute values are passed through <code>unescape</code>
+ * @throws ParseException
+ * declared for the calls made on <code>tag</code>
+ */
+ private static void parseAttributes(HtmlTag tag, String s, int i, boolean escapeValues) throws ParseException {
+     while (i < s.length()) {
+         // skip whitespace
+         while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
+             i++;
+         }
+
+         if (i == s.length()) {
+             return;
+         }
+
+         // read the attribute name -- the rule might be looser than the RFC specifies:
+         // everything up to a space or an equal sign is included
+         int start = i;
+         for (; i < s.length() && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '='; i++) {
+             // just move forward
+         }
+         String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH);
+
+         if (attributeName.equals("/")) { //$NON-NLS-1$
+             tag.setSelfTerminating(true);
+             continue;
+         }
+
+         for (; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
+             // just move forward
+         }
+         if (i == s.length() || s.charAt(i) != '=') {
+             // no attribute value
+             tag.setAttribute(attributeName, ""); //$NON-NLS-1$
+             continue;
+         }
+
+         // skip whitespace to the start of attribute value
+         for (i = i + 1; i < s.length() && Character.isWhitespace(s.charAt(i)); i++) {
+             // just move forward
+         }
+         if (i == s.length()) {
+             return;
+         }
+
+         // read the attribute value -- the rule for unquoted attribute value is looser than the one in
+         // Conolly's W3C 1996 lexical analyzer draft: everything is included up to the next space
+         String attributeValue;
+         char first = s.charAt(i);
+         if (first == '"' || first == '\'') {
+             // both quote styles are handled alike so that escapeValues is honoured for each of them
+             start = i + 1;
+             int end = s.indexOf(first, start);
+             if (end == -1) {
+                 return; // shouldn't happen if input returned by nextToken
+             }
+             String raw = s.substring(start, end);
+             attributeValue = escapeValues ? unescape(raw) : raw;
+             i = end + 1;
+         } else {
+             start = i;
+             for (; i < s.length() && !Character.isWhitespace(s.charAt(i)); i++) {
+                 // just move forward
+             }
+             attributeValue = s.substring(start, i);
+         }
+         tag.setAttribute(attributeName, attributeValue);
+     }
+ }
+
+ /**
+ * Converts HTML character and entity references in a string into the characters they stand for. A string that
+ * contains no <code>&amp;</code> is returned as is.
+ *
+ * @param s
+ * text that may contain HTML escapes
+ * @return the text with recognised escapes replaced
+ * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
+ */
+ @Deprecated
+ public static String unescape(String s) {
+     if (s.indexOf('&') >= 0) {
+         return unescape(new StringBuffer(s)).toString();
+     }
+     return s;
+ }
+
+ /**
+ * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters.
+ * <p>
+ * A reference starts at <code>&amp;</code> and extends over the following letters and digits (with an optional
+ * <code>#</code> directly after the ampersand). The single character that ends the name, normally <code>;</code>,
+ * is consumed together with the reference. References that are unknown, malformed or that denote a character
+ * outside the XML character range are left in the text unchanged.
+ *
+ * @param sb
+ * buffer to rewrite; its length shrinks when escapes are replaced
+ * @return the same buffer instance
+ * @deprecated use {@link StringEscapeUtils#unescapeHtml(String)} instead
+ */
+ @Deprecated
+ public static StringBuffer unescape(StringBuffer sb) {
+     int i = 0; // index into the unprocessed section of the buffer
+     int j = 0; // index into the processed section of the buffer
+
+     while (i < sb.length()) {
+         char ch = sb.charAt(i);
+         if (ch == '&') {
+             int start = i;
+             String escape = null;
+             for (i = i + 1; i < sb.length(); i++) {
+                 ch = sb.charAt(i);
+                 if (!Character.isLetterOrDigit(ch) && !(ch == '#' && i == (start + 1))) {
+                     escape = sb.substring(start + 1, i);
+                     break;
+                 }
+             }
+             if (i == sb.length() && i != (start + 1)) {
+                 // reference runs to the end of the buffer without a terminator
+                 escape = sb.substring(start + 1);
+             }
+             if (escape != null) {
+                 Character character = parseReference(escape);
+                 // validate the decoded character itself, not the character that terminated the reference
+                 if (character != null && !isValidXmlChar(character.charValue())) {
+                     character = null;
+                 }
+                 if (character != null) {
+                     ch = character.charValue();
+                 } else {
+                     // not an HTML escape; rewind
+                     i = start;
+                     ch = '&';
+                 }
+             }
+         }
+         sb.setCharAt(j, ch);
+         i++;
+         j++;
+     }
+
+     sb.setLength(j);
+     return sb;
+ }
+
+ /**
+ * Tells whether a character is allowed by the XML 1.0 <code>Char</code> production
+ * (http://www.w3.org/TR/REC-xml/#charsets). The supplementary range of that production cannot occur in a single
+ * <code>char</code> and is therefore not tested.
+ */
+ private static boolean isValidXmlChar(char c) {
+     return c == 0x09 || c == 0x0A || c == 0x0D || (c >= 0x20 && c <= 0xD7FF) || (c >= 0xE000 && c <= 0xFFFD);
+ }
+
+ /**
+ * Parses HTML character and entity references and returns the corresponding character.
+ *
+ * @param s
+ * the reference without the leading <code>&amp;</code> and without the terminator, for example
+ * <code>amp</code>, <code>#65</code> or <code>#x41</code>
+ * @return the referenced character, or <code>null</code> if the reference is empty, unknown, not a number, or
+ * denotes a code point that does not fit into a single <code>char</code>
+ */
+ private static Character parseReference(String s) {
+     if (s.length() == 0) {
+         return null;
+     }
+     if (s.charAt(0) != '#') {
+         // named entity reference
+         return entities.get(s);
+     }
+
+     // numeric character reference
+     if (s.length() == 1) {
+         return null;
+     }
+     try {
+         int value;
+         char radixMarker = s.charAt(1);
+         if (radixMarker == 'x' || radixMarker == 'X') {
+             // Hex reference
+             value = Integer.parseInt(s.substring(2), 16);
+         } else {
+             // Decimal reference
+             value = Integer.parseInt(s.substring(1));
+         }
+         if (value < Character.MIN_VALUE || value > Character.MAX_VALUE) {
+             // casting would silently truncate to an unrelated character
+             return null;
+         }
+         return Character.valueOf((char) value);
+     } catch (NumberFormatException e) {
+         return null;
+     }
+ }
+
+ /**
+ * A token handed out by the tokenizer: end of input, a run of text, a tag or a comment, together with the
+ * whitespace that preceded it.
+ */
+ public static class Token {
+     public static final Type EOF = new Type();
+
+     public static final Type TEXT = new Type();
+
+     public static final Type TAG = new Type();
+
+     public static final Type COMMENT = new Type();
+
+     /** kind of this token; one of the constants above */
+     private final Type type;
+
+     /** payload of the token, see {@link #getValue()} */
+     private final Object value;
+
+     /** whitespace that preceded the token, <code>null</code> for the EOF token */
+     private final StringBuffer whitespace;
+
+     /**
+     * Creates the EOF token, which carries neither value nor whitespace.
+     */
+     protected Token() {
+         this.type = EOF;
+         this.value = null;
+         this.whitespace = null;
+     }
+
+     /**
+     * Creates a token of type <code>TAG</code>.
+     */
+     protected Token(HtmlTag tag, StringBuffer whitespace) {
+         this.type = TAG;
+         this.value = tag;
+         this.whitespace = whitespace;
+     }
+
+     /**
+     * Creates a token of type <code>COMMENT</code> if <code>comment</code> is set, otherwise of type
+     * <code>TEXT</code>.
+     */
+     protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) {
+         this.type = comment ? COMMENT : TEXT;
+         this.value = text;
+         this.whitespace = whitespace;
+     }
+
+     /**
+     * Returns the kind of this token.
+     */
+     public Type getType() {
+         return type;
+     }
+
+     /**
+     * Returns the whitespace that preceded the token.
+     */
+     public StringBuffer getWhitespace() {
+         return whitespace;
+     }
+
+     /**
+     * Returns the payload of the token: an HtmlTag for type <code>TAG</code>, a StringBuffer for types
+     * <code>TEXT</code> and <code>COMMENT</code>, and <code>null</code> for type <code>EOF</code>.
+     */
+     public Object getValue() {
+         return value;
+     }
+
+     /**
+     * Returns the preceding whitespace followed by the string form of the value. Comment values are wrapped in
+     * <code>&lt;!--</code> and <code>--&gt;</code>; all other values are appended as they are.
+     */
+     @Override
+     public String toString() {
+         StringBuffer result = new StringBuffer();
+         if (whitespace != null) {
+             result.append(whitespace);
+         }
+         if (value != null) {
+             final boolean isComment = type == COMMENT;
+             if (isComment) {
+                 result.append("<!--"); //$NON-NLS-1$
+             }
+             result.append(value);
+             if (isComment) {
+                 result.append("-->"); //$NON-NLS-1$
+             }
+         }
+         return result.toString();
+     }
+
+     /**
+     * Marker class whose instances identify the token kinds; instances are compared by identity.
+     */
+     private static class Type {
+         private Type() {
+             // instances exist only as the constants declared in Token
+         }
+     }
+ }
+
+ /**
+ * States of the tokenizer's scanning loop. Values are only ever compared by identity.
+ */
+ private enum State {
+     /** the reader is exhausted */
+     EOF,
+     /** inside a comment, after the opening marker */
+     COMMENT,
+     /** inside ordinary text */
+     TEXT,
+     /** inside a tag, outside of any quoted value */
+     TAG,
+     /** inside a run of whitespace between text tokens */
+     WS,
+     /** inside a quoted value within a tag */
+     TAG_QUOTE
+ }
+
+ /** maps the name of an HTML entity reference (without <code>&amp;</code> and <code>;</code>) to its character */
+ private static final HashMap<String, Character> entities = new HashMap<String, Character>();
+
+ /*
+ * Based on ISO 8879.
+ *
+ * Portions (c) International Organization for Standardization 1986
+ * Permission to copy in any form is granted for use with conforming SGML
+ * systems and applications as defined in ISO 8879, provided this notice is
+ * included in all copies.
+ *
+ */
+ static {
+     // ISO 8859-1 entities, listed in code point order from U+00A0 (nbsp) to U+00FF (yuml);
+     // the position in this table determines the character, so the order must not change
+     final String[] latin1Names = {
+             "nbsp", "iexcl", "cent", "pound", // U+00A0 - U+00A3 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "curren", "yen", "brvbar", "sect", // U+00A4 - U+00A7 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "uml", "copy", "ordf", "laquo", // U+00A8 - U+00AB //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "not", "shy", "reg", "macr", // U+00AC - U+00AF //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "deg", "plusmn", "sup2", "sup3", // U+00B0 - U+00B3 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "acute", "micro", "para", "middot", // U+00B4 - U+00B7 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "cedil", "sup1", "ordm", "raquo", // U+00B8 - U+00BB //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "frac14", "frac12", "frac34", "iquest", // U+00BC - U+00BF //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Agrave", "Aacute", "Acirc", "Atilde", // U+00C0 - U+00C3 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Auml", "Aring", "AElig", "Ccedil", // U+00C4 - U+00C7 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Egrave", "Eacute", "Ecirc", "Euml", // U+00C8 - U+00CB //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Igrave", "Iacute", "Icirc", "Iuml", // U+00CC - U+00CF //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "ETH", "Ntilde", "Ograve", "Oacute", // U+00D0 - U+00D3 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Ocirc", "Otilde", "Ouml", "times", // U+00D4 - U+00D7 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Oslash", "Ugrave", "Uacute", "Ucirc", // U+00D8 - U+00DB //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "Uuml", "Yacute", "THORN", "szlig", // U+00DC - U+00DF //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "agrave", "aacute", "acirc", "atilde", // U+00E0 - U+00E3 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "auml", "aring", "aelig", "ccedil", // U+00E4 - U+00E7 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "egrave", "eacute", "ecirc", "euml", // U+00E8 - U+00EB //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "igrave", "iacute", "icirc", "iuml", // U+00EC - U+00EF //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "eth", "ntilde", "ograve", "oacute", // U+00F0 - U+00F3 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "ocirc", "otilde", "ouml", "divide", // U+00F4 - U+00F7 //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "oslash", "ugrave", "uacute", "ucirc", // U+00F8 - U+00FB //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+             "uuml", "yacute", "thorn", "yuml" // U+00FC - U+00FF //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$ //$NON-NLS-4$
+     };
+     for (int k = 0; k < latin1Names.length; k++) {
+         entities.put(latin1Names[k], Character.valueOf((char) (0xA0 + k)));
+     }
+
+     // Special characters
+     entities.put("quot", Character.valueOf('"')); // quotation mark = APL quote //$NON-NLS-1$
+     entities.put("amp", Character.valueOf('&')); // ampersand //$NON-NLS-1$
+     entities.put("lt", Character.valueOf('<')); // less-than sign //$NON-NLS-1$
+     entities.put("gt", Character.valueOf('>')); // greater-than sign //$NON-NLS-1$
+
+     // Latin Extended-A
+     entities.put("OElig", Character.valueOf((char) 0x0152)); // latin capital ligature OE //$NON-NLS-1$
+     entities.put("oelig", Character.valueOf((char) 0x0153)); // latin small ligature oe //$NON-NLS-1$
+     entities.put("Scaron", Character.valueOf((char) 0x0160)); // latin capital letter S with caron //$NON-NLS-1$
+     entities.put("scaron", Character.valueOf((char) 0x0161)); // latin small letter s with caron //$NON-NLS-1$
+     entities.put("Yuml", Character.valueOf((char) 0x0178)); // latin capital letter Y with diaeresis //$NON-NLS-1$
+
+     // Spacing Modifier Letters
+     entities.put("circ", Character.valueOf((char) 0x02C6)); // modifier letter circumflex accent //$NON-NLS-1$
+     entities.put("tilde", Character.valueOf((char) 0x02DC)); // small tilde //$NON-NLS-1$
+
+     // General punctuation
+     entities.put("ensp", Character.valueOf((char) 0x2002)); // en space //$NON-NLS-1$
+     entities.put("emsp", Character.valueOf((char) 0x2003)); // em space //$NON-NLS-1$
+     entities.put("thinsp", Character.valueOf((char) 0x2009)); // thin space //$NON-NLS-1$
+     entities.put("zwnj", Character.valueOf((char) 0x200C)); // zero width non-joiner //$NON-NLS-1$
+     entities.put("zwj", Character.valueOf((char) 0x200D)); // zero width joiner //$NON-NLS-1$
+     entities.put("lrm", Character.valueOf((char) 0x200E)); // left-to-right mark //$NON-NLS-1$
+     entities.put("rlm", Character.valueOf((char) 0x200F)); // right-to-left mark //$NON-NLS-1$
+     entities.put("ndash", Character.valueOf((char) 0x2013)); // en dash //$NON-NLS-1$
+     entities.put("mdash", Character.valueOf((char) 0x2014)); // em dash //$NON-NLS-1$
+     entities.put("lsquo", Character.valueOf((char) 0x2018)); // left single quotation mark //$NON-NLS-1$
+     entities.put("rsquo", Character.valueOf((char) 0x2019)); // right single quotation mark //$NON-NLS-1$
+     entities.put("sbquo", Character.valueOf((char) 0x201A)); // single low-9 quotation mark //$NON-NLS-1$
+     entities.put("ldquo", Character.valueOf((char) 0x201C)); // left double quotation mark //$NON-NLS-1$
+     entities.put("rdquo", Character.valueOf((char) 0x201D)); // right double quotation mark //$NON-NLS-1$
+     entities.put("bdquo", Character.valueOf((char) 0x201E)); // double low-9 quotation mark //$NON-NLS-1$
+     entities.put("dagger", Character.valueOf((char) 0x2020)); // dagger //$NON-NLS-1$
+     entities.put("Dagger", Character.valueOf((char) 0x2021)); // double dagger //$NON-NLS-1$
+     entities.put("permil", Character.valueOf((char) 0x2030)); // per mille sign //$NON-NLS-1$
+     entities.put("lsaquo", Character.valueOf((char) 0x2039)); // single left-pointing angle quotation mark //$NON-NLS-1$
+     entities.put("rsaquo", Character.valueOf((char) 0x203A)); // single right-pointing angle quotation mark //$NON-NLS-1$
+     entities.put("euro", Character.valueOf((char) 0x20AC)); // euro sign //$NON-NLS-1$
+ }
+}
diff --git a/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlTag.java b/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlTag.java
new file mode 100644
index 0000000..e03f1ac
--- /dev/null
+++ b/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlTag.java
@@ -0,0 +1,374 @@
+/*******************************************************************************
+ * Copyright (c) 2004, 2009 Tasktop Technologies and others.
+ * All rights reserved. This program and the accompanying materials
+ * are made available under the terms of the Eclipse Public License v1.0
+ * which accompanies this distribution, and is available at
+ * http://www.eclipse.org/legal/epl-v10.html
+ *
+ * Contributors:
+ * Tasktop Technologies - initial API and implementation
+ *******************************************************************************/
+
+package org.eclipse.mylyn.commons.core;
+
+import java.net.URL;
+import java.text.ParseException;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.Locale;
+import java.util.Map;
+
+import javax.swing.text.html.HTML.Tag;
+
+/**
+ * Class representing an HTML (3.2) tag and its attributes.
+ * <p>
+ * Attribute names are normalized to lower case both when they are stored and when they are looked up, so
+ * attribute access is case-insensitive.
+ *
+ * @author Shawn Minto
+ * @since 3.7
+ */
+public class HtmlTag {
+    /** tag's name, without the leading slash of a closing tag */
+    private String tagName;
+
+    /** tag type constant looked up from the tag's name */
+    private Tag tagType;
+
+    /** true if the tag is a closing tag */
+    private boolean isEndTag;
+
+    /** tag's attributes (keys are lowercase attribute names) */
+    private final Map<String, String> attributes;
+
+    /** tag's base url, used to resolve relative links; may be <code>null</code> */
+    private final URL baseUrl;
+
+    /** tag is self terminated */
+    private boolean selfTerminating;
+
+    /**
+     * Basic constructor. The tag is uninitialized: it has no name, no attributes, no base url and the type
+     * {@link Type#UNKNOWN}.
+     */
+    public HtmlTag() {
+        this((URL) null);
+    }
+
+    /**
+     * Copy constructor. Copies the name, type, end-tag flag, self-terminating flag and base url of the given tag
+     * and takes an independent copy of its attributes.
+     *
+     * @param htmltag
+     *            the tag to copy; must not be <code>null</code>
+     */
+    public HtmlTag(HtmlTag htmltag) {
+        tagName = htmltag.tagName;
+        tagType = htmltag.tagType;
+        isEndTag = htmltag.isEndTag;
+        // this flag was previously lost when copying a tag
+        selfTerminating = htmltag.selfTerminating;
+        attributes = new HashMap<String, String>(htmltag.attributes);
+        baseUrl = htmltag.baseUrl;
+    }
+
+    /**
+     * Constructor creating a tag with the given name and no base url.
+     *
+     * @param s
+     *            the tag name, optionally prefixed with '/' for a closing tag
+     * @throws ParseException
+     *             declared for API compatibility; this implementation does not throw it
+     * @throws IllegalArgumentException
+     *             if the name is <code>null</code> or empty, see {@link #setTagName(String)}
+     */
+    public HtmlTag(String s) throws ParseException {
+        attributes = new HashMap<String, String>();
+        setTagName(s);
+        baseUrl = null;
+    }
+
+    /**
+     * Constructor creating an otherwise empty tag, but with a given base url.
+     *
+     * @param url
+     *            the url that relative links are resolved against in {@link #getURLs()}; may be <code>null</code>
+     */
+    public HtmlTag(URL url) {
+        tagName = null;
+        tagType = Type.UNKNOWN;
+        isEndTag = false;
+        attributes = new HashMap<String, String>();
+        baseUrl = url;
+    }
+
+    /**
+     * Returns the tag's type (linked to the tag's name).
+     */
+    public Tag getTagType() {
+        return tagType;
+    }
+
+    /**
+     * Returns the tag's name (e.g., "HEAD", "P", etc.).
+     */
+    public String getTagName() {
+        return tagName;
+    }
+
+    /**
+     * Sets the tag's name and type, if known. A leading '/' marks the tag as a closing tag and is not part of the
+     * stored name. The type is looked up case-insensitively and falls back to {@link Type#UNKNOWN}.
+     *
+     * @throws IllegalArgumentException
+     *             if the argument is <code>null</code>, an empty string or consists of only "/"
+     */
+    public void setTagName(String s) throws IllegalArgumentException {
+        if (s == null || s.length() == 0) {
+            throw new IllegalArgumentException("Empty tag name"); //$NON-NLS-1$
+        }
+        if (s.charAt(0) == '/') {
+            isEndTag = true;
+            s = s.substring(1);
+        }
+        if (s.length() == 0) {
+            throw new IllegalArgumentException("Empty tag name"); //$NON-NLS-1$
+        }
+        tagName = s;
+        Tag knownType = tags.get(s.toUpperCase(Locale.ENGLISH));
+        tagType = knownType != null ? knownType : Type.UNKNOWN;
+    }
+
+    /**
+     * Returns <code>true</code> if the tag is a closing tag.
+     */
+    public boolean isEndTag() {
+        return isEndTag;
+    }
+
+    /**
+     * Returns the value of a tag's attribute as an integer.
+     *
+     * @throws NumberFormatException
+     *             if the attribute does not exist or its value is not a valid integer
+     */
+    public int getIntAttribute(String s) throws NumberFormatException {
+        return Integer.parseInt(getAttribute(s));
+    }
+
+    /**
+     * Returns the value of a tag's attribute, or <code>null</code> if it doesn't exist. The name is matched
+     * case-insensitively, mirroring the normalization done by {@link #setAttribute(String, String)}.
+     */
+    public String getAttribute(String s) {
+        if (s == null) {
+            return null;
+        }
+        return attributes.get(s.toLowerCase(Locale.ENGLISH));
+    }
+
+    /**
+     * Returns <code>true</code> if the tag contains attribute with the given name. An attribute that was stored
+     * with a <code>null</code> value is reported as absent.
+     */
+    public boolean hasAttribute(String s) {
+        return getAttribute(s) != null;
+    }
+
+    /**
+     * Sets the value of a tag's attribute. The name is stored in lower case.
+     */
+    public void setAttribute(String name, String value) {
+        attributes.put(name.toLowerCase(Locale.ENGLISH), value);
+    }
+
+    /**
+     * Returns the link targets found in the <code>href</code> and <code>src</code> attributes as absolute urls,
+     * appended to one buffer without any separator.
+     * <p>
+     * Targets that end with ".jpg", ".gif", ".css" or ".js", start with "mailto" or contain '#' are skipped.
+     * Absolute targets are only included if they use the "http://" scheme. Relative targets are resolved against
+     * the base url and are skipped if the tag has no base url. Note that only protocol and host of the base url
+     * are used; its port is not carried over.
+     */
+    public StringBuffer getURLs() {
+        StringBuffer sb = new StringBuffer();
+        for (Map.Entry<String, String> attribute : attributes.entrySet()) {
+            String name = attribute.getKey();
+            if (!"href".equals(name) && !"src".equals(name)) { //$NON-NLS-1$ //$NON-NLS-2$
+                continue;
+            }
+            String target = attribute.getValue();
+            if (target == null || !isLinkCandidate(target)) {
+                continue;
+            }
+            target = stripLeadingWhitespace(target).replace('\\', '/');
+            if (target.length() == 0) {
+                // nothing but whitespace
+                continue;
+            }
+
+            if (target.startsWith("news:") || (target.indexOf("://") != -1 && target.length() >= 7)) { //$NON-NLS-1$ //$NON-NLS-2$
+                // Absolute URL; regionMatches is safe for targets shorter than the scheme prefix
+                if (target.regionMatches(true, 0, "http://", 0, 7)) { //$NON-NLS-1$
+                    sb.append(target);
+                }
+            } else if (baseUrl != null) {
+                // Relative URL
+                sb.append(resolveRelative(target));
+            }
+        }
+        return sb;
+    }
+
+    /** Returns <code>true</code> if the raw attribute value passes the filters applied by getURLs(). */
+    private static boolean isLinkCandidate(String target) {
+        return target.length() > 0 //
+                && !target.endsWith(".jpg") && !target.endsWith(".gif") && !target.endsWith(".css") //$NON-NLS-1$ //$NON-NLS-2$ //$NON-NLS-3$
+                && !target.endsWith(".js") && !target.startsWith("mailto") && target.indexOf('#') == -1; //$NON-NLS-1$ //$NON-NLS-2$
+    }
+
+    /** Returns the given string without its leading whitespace characters. */
+    private static String stripLeadingWhitespace(String s) {
+        int start = 0;
+        while (start < s.length() && Character.isWhitespace(s.charAt(start))) {
+            start++;
+        }
+        return s.substring(start);
+    }
+
+    /** Resolves a relative link target against the base url, which must not be <code>null</code>. */
+    private String resolveRelative(String target) {
+        if (target.startsWith("//")) { //$NON-NLS-1$
+            return baseUrl.getProtocol() + ":" + target; //$NON-NLS-1$
+        }
+        String hostPrefix = baseUrl.getProtocol() + "://" + baseUrl.getHost(); //$NON-NLS-1$
+        if (target.startsWith("/")) { //$NON-NLS-1$
+            return hostPrefix + target;
+        }
+        String baseDir = getBaseDirectory();
+        while (target.startsWith("../")) { //$NON-NLS-1$
+            if (baseDir.length() > 0) {
+                // can't go above root
+                int lastSep = baseDir.lastIndexOf('/');
+                baseDir = lastSep >= 0 ? baseDir.substring(0, lastSep) : ""; //$NON-NLS-1$
+            }
+            target = target.substring(3);
+        }
+        return hostPrefix + baseDir + "/" + target; //$NON-NLS-1$
+    }
+
+    /** Returns the path of the base url up to, but excluding, its last '/' and any trailing "/." segments. */
+    private String getBaseDirectory() {
+        String baseDir = baseUrl.getPath();
+        int queryStart = baseDir.indexOf('?');
+        int lastSep = queryStart < 0 ? baseDir.lastIndexOf('/') : baseDir.lastIndexOf('/', queryStart - 1);
+        if (lastSep >= 0) {
+            baseDir = baseDir.substring(0, lastSep);
+        }
+        while (baseDir.length() > 1 && baseDir.endsWith("/.")) { //$NON-NLS-1$
+            baseDir = baseDir.substring(0, baseDir.length() - 2);
+        }
+        return baseDir;
+    }
+
+    /**
+     * Returns the tag in its HTML notation, e.g. <code>&lt;a href="index.html"&gt;</code>. Attributes without a
+     * value are written with an empty value.
+     */
+    @Override
+    public String toString() {
+        StringBuilder sb = new StringBuilder();
+        sb.append('<');
+        if (isEndTag) {
+            sb.append('/');
+        }
+        sb.append(tagName);
+        for (Map.Entry<String, String> attribute : attributes.entrySet()) {
+            sb.append(' ');
+            sb.append(attribute.getKey());
+            sb.append("=\""); //$NON-NLS-1$
+            String value = attribute.getValue();
+            if (value != null) {
+                sb.append(value);
+            }
+            sb.append('"');
+        }
+        if (selfTerminating) {
+            sb.append('/');
+        }
+        sb.append('>');
+
+        return sb.toString();
+    }
+
+    /**
+     * Enum class for tag types. Adds constants for tags that have no counterpart among the constants of
+     * {@link Tag} used by this class.
+     */
+    public static class Type extends Tag {
+        public static final Tag UNKNOWN = new Tag();
+
+        public static final Tag THEAD = new Type("THEAD"); //$NON-NLS-1$
+
+        public static final Tag DOCTYPE = new Type("!DOCTYPE"); //$NON-NLS-1$
+
+        public static final Tag LABEL = new Type("LABEL"); //$NON-NLS-1$
+
+        private Type(String name) {
+            super(name);
+        }
+    }
+
+    /** known tag types, keyed by upper case tag name */
+    private static final Map<String, Tag> tags = new HashMap<String, Tag>();
+    static {
+        tags.put("A", Tag.A); //$NON-NLS-1$
+        tags.put("ADDRESS", Tag.ADDRESS); //$NON-NLS-1$
+        tags.put("APPLET", Tag.APPLET); //$NON-NLS-1$
+        tags.put("AREA", Tag.AREA); //$NON-NLS-1$
+        tags.put("B", Tag.B); //$NON-NLS-1$
+        tags.put("BASE", Tag.BASE); //$NON-NLS-1$
+        tags.put("BASEFONT", Tag.BASEFONT); //$NON-NLS-1$
+        tags.put("BIG", Tag.BIG); //$NON-NLS-1$
+        tags.put("BLOCKQUOTE", Tag.BLOCKQUOTE); //$NON-NLS-1$
+        tags.put("BODY", Tag.BODY); //$NON-NLS-1$
+        tags.put("BR", Tag.BR); //$NON-NLS-1$
+        tags.put("CAPTION", Tag.CAPTION); //$NON-NLS-1$
+        tags.put("CENTER", Tag.CENTER); //$NON-NLS-1$
+        tags.put("CITE", Tag.CITE); //$NON-NLS-1$
+        tags.put("CODE", Tag.CODE); //$NON-NLS-1$
+        tags.put("DD", Tag.DD); //$NON-NLS-1$
+        tags.put("DFN", Tag.DFN); //$NON-NLS-1$
+        tags.put("DIR", Tag.DIR); //$NON-NLS-1$
+        tags.put("DIV", Tag.DIV); //$NON-NLS-1$
+        tags.put("DL", Tag.DL); //$NON-NLS-1$
+        tags.put("!DOCTYPE", Type.DOCTYPE); //$NON-NLS-1$
+        tags.put("DT", Tag.DT); //$NON-NLS-1$
+        tags.put("EM", Tag.EM); //$NON-NLS-1$
+        tags.put("FONT", Tag.FONT); //$NON-NLS-1$
+        tags.put("FORM", Tag.FORM); //$NON-NLS-1$
+        tags.put("FRAME", Tag.FRAME); //$NON-NLS-1$
+        tags.put("FRAMESET", Tag.FRAMESET); //$NON-NLS-1$
+        tags.put("H1", Tag.H1); //$NON-NLS-1$
+        tags.put("H2", Tag.H2); //$NON-NLS-1$
+        tags.put("H3", Tag.H3); //$NON-NLS-1$
+        tags.put("H4", Tag.H4); //$NON-NLS-1$
+        tags.put("H5", Tag.H5); //$NON-NLS-1$
+        tags.put("H6", Tag.H6); //$NON-NLS-1$
+        tags.put("HEAD", Tag.HEAD); //$NON-NLS-1$
+        tags.put("HTML", Tag.HTML); //$NON-NLS-1$
+        tags.put("HR", Tag.HR); //$NON-NLS-1$
+        tags.put("I", Tag.I); //$NON-NLS-1$
+        tags.put("IMG", Tag.IMG); //$NON-NLS-1$
+        tags.put("INPUT", Tag.INPUT); //$NON-NLS-1$
+        tags.put("ISINDEX", Tag.ISINDEX); //$NON-NLS-1$
+        tags.put("KBD", Tag.KBD); //$NON-NLS-1$
+        tags.put("LI", Tag.LI); //$NON-NLS-1$
+        tags.put("LABEL", Type.LABEL); //$NON-NLS-1$
+        tags.put("LINK", Tag.LINK); //$NON-NLS-1$
+        tags.put("MAP", Tag.MAP); //$NON-NLS-1$
+        tags.put("MENU", Tag.MENU); //$NON-NLS-1$
+        tags.put("META", Tag.META); //$NON-NLS-1$
+        tags.put("NOFRAMES", Tag.NOFRAMES); //$NON-NLS-1$
+        tags.put("OBJECT", Tag.OBJECT); //$NON-NLS-1$
+        tags.put("OL", Tag.OL); //$NON-NLS-1$
+        tags.put("OPTION", Tag.OPTION); //$NON-NLS-1$
+        tags.put("P", Tag.P); //$NON-NLS-1$
+        tags.put("PARAM", Tag.PARAM); //$NON-NLS-1$
+        tags.put("PRE", Tag.PRE); //$NON-NLS-1$
+        tags.put("S", Tag.S); //$NON-NLS-1$
+        tags.put("SAMP", Tag.SAMP); //$NON-NLS-1$
+        tags.put("SCRIPT", Tag.SCRIPT); //$NON-NLS-1$
+        tags.put("SELECT", Tag.SELECT); //$NON-NLS-1$
+        tags.put("SMALL", Tag.SMALL); //$NON-NLS-1$
+        tags.put("SPAN", Tag.SPAN); //$NON-NLS-1$
+        tags.put("STRONG", Tag.STRONG); //$NON-NLS-1$
+        tags.put("STYLE", Tag.STYLE); //$NON-NLS-1$
+        tags.put("SUB", Tag.SUB); //$NON-NLS-1$
+        tags.put("SUP", Tag.SUP); //$NON-NLS-1$
+        tags.put("TABLE", Tag.TABLE); //$NON-NLS-1$
+        tags.put("TD", Tag.TD); //$NON-NLS-1$
+        tags.put("TEXTAREA", Tag.TEXTAREA); //$NON-NLS-1$
+        tags.put("TH", Tag.TH); //$NON-NLS-1$
+        tags.put("THEAD", Type.THEAD); //$NON-NLS-1$
+        tags.put("TITLE", Tag.TITLE); //$NON-NLS-1$
+        tags.put("TR", Tag.TR); //$NON-NLS-1$
+        tags.put("TT", Tag.TT); //$NON-NLS-1$
+        tags.put("U", Tag.U); //$NON-NLS-1$
+        tags.put("UL", Tag.UL); //$NON-NLS-1$
+        tags.put("VAR", Tag.VAR); //$NON-NLS-1$
+    }
+
+    /**
+     * Marks the tag as self terminating (e.g. <code>&lt;br/&gt;</code>) or not.
+     */
+    public void setSelfTerminating(boolean terminating) {
+        this.selfTerminating = terminating;
+    }
+
+    /**
+     * Returns <code>true</code> if the tag is self terminating.
+     */
+    public boolean isSelfTerminating() {
+        return selfTerminating;
+    }
+}
diff --git a/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlUtil.java b/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlUtil.java
index 471fb54..c24e6f9 100644
--- a/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlUtil.java
+++ b/org.eclipse.mylyn.commons.core/src/org/eclipse/mylyn/commons/core/HtmlUtil.java
@@ -13,7 +13,9 @@ package org.eclipse.mylyn.commons.core;
import java.io.IOException;
import java.io.StringReader;
+import java.text.ParseException;
+import org.eclipse.mylyn.commons.core.HtmlStreamTokenizer.Token;
import org.eclipse.mylyn.internal.commons.core.Html2TextReader;
/**
@@ -41,4 +43,22 @@ public class HtmlUtil {
return sb.toString();
}
+ /**
+  * Collects the text at the current position of the given tokenizer. Text tokens are trimmed and joined with
+  * single spaces and comment tokens are skipped. Reading stops at the end of the stream or at the first token
+  * that is neither text nor a comment; that token is consumed from the tokenizer.
+  *
+  * @return the collected text without leading or trailing whitespace
+  * @since 3.7
+  */
+ public static String getTextContent(HtmlStreamTokenizer tokenizer) throws IOException, ParseException {
+     StringBuilder text = new StringBuilder();
+     Token current = tokenizer.nextToken();
+     while (current.getType() != Token.EOF) {
+         if (current.getType() == Token.TEXT) {
+             text.append(current.toString().trim()).append(" "); //$NON-NLS-1$
+         } else if (current.getType() != Token.COMMENT) {
+             // anything other than text or a comment ends the text run
+             break;
+         }
+         current = tokenizer.nextToken();
+     }
+     return text.toString().trim();
+ }
+
+
}
diff --git a/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java b/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java
index f3194c4..c374b44 100644
--- a/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java
+++ b/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlStreamTokenizer.java
@@ -26,7 +26,9 @@ import org.apache.commons.lang.StringEscapeUtils;
*
* @author Shawn Minto
* @since 2.0
+ * @deprecated use org.eclipse.mylyn.commons.core.HtmlStreamTokenizer instead.
*/
+@Deprecated
public class HtmlStreamTokenizer {
/** parser state */
diff --git a/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlTag.java b/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlTag.java
index f890a56..bf761f4 100644
--- a/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlTag.java
+++ b/org.eclipse.mylyn.commons.net/src/org/eclipse/mylyn/commons/net/HtmlTag.java
@@ -22,9 +22,11 @@ import javax.swing.text.html.HTML.Tag;
/**
* Class representing an HTML (3.2) tag and its attributes.
*
+ * @deprecated use org.eclipse.mylyn.commons.core.HtmlTag instead.
* @author Shawn Minto
* @since 2.0
*/
+@Deprecated
public class HtmlTag {
/** tag's name */
private String tagName;
diff --git a/org.eclipse.mylyn.commons.repositories.http.core/src/org/eclipse/mylyn/commons/repositories/http/core/CommonHttpClient.java b/org.eclipse.mylyn.commons.repositories.http.core/src/org/eclipse/mylyn/commons/repositories/http/core/CommonHttpClient.java
index 87404aa..47725df 100644
--- a/org.eclipse.mylyn.commons.repositories.http.core/src/org/eclipse/mylyn/commons/repositories/http/core/CommonHttpClient.java
+++ b/org.eclipse.mylyn.commons.repositories.http.core/src/org/eclipse/mylyn/commons/repositories/http/core/CommonHttpClient.java
@@ -19,6 +19,7 @@ import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.impl.client.AbstractHttpClient;
import org.apache.http.impl.client.ContentEncodingHttpClient;
+import org.apache.http.protocol.HttpContext;
import org.apache.http.protocol.SyncBasicHttpContext;
import org.eclipse.core.runtime.IProgressMonitor;
import org.eclipse.mylyn.commons.core.operations.IOperationMonitor;
@@ -50,6 +51,10 @@ public class CommonHttpClient {
return HttpUtil.execute(getHttpClient(), HttpUtil.createHost(request), context, request, monitor);
}
+ /**
+  * Returns the HTTP context that this client passes along when it executes requests.
+  *
+  * @return the context held by this client
+  */
+ public HttpContext getContext() {
+     return this.context;
+ }
+
public synchronized AbstractHttpClient getHttpClient() {
if (httpClient == null) {
httpClient = createHttpClient(null);