/*******************************************************************************
 * Copyright (c) 2003 - 2005 University Of British Columbia and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     University Of British Columbia - initial API and implementation
 *******************************************************************************/
package org.eclipse.mylar.bugzilla.core.internal;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.net.URL;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Locale;

/**
 * A lenient, pull-style tokenizer that splits an HTML character stream into
 * text, tag, comment and end-of-file tokens.
 *
 * Each token also carries the whitespace that preceded it. Tags are parsed
 * into HtmlTag objects; attribute names are lower-cased and quoted attribute
 * values have their character/entity references resolved.
 */
public class HtmlStreamTokenizer {

    /** parser state */
    private State state;

    /** reader from which to parse the text */
    private final BufferedReader in;

    /** base URL for resolving relative URLs */
    private final URL base;

    /** buffer holding the text of the current token */
    private final StringBuffer textBuffer;

    /** buffer holding whitespace preceding the current token */
    private final StringBuffer whitespaceBuffer;

    /** holds a token that was read and then put back in the queue to be returned again on nextToken call */
    private Token pushbackToken;

    /**
     * holds a character that was read and then determined not to be part of
     * the current token; 0 means "nothing pushed back"
     */
    private int pushbackChar;

    /** current quote delimiter (single or double) */
    private int quoteChar;

    /**
     * Constructor.
     *
     * @param in reader for the HTML document to tokenize
     * @param base URL for resolving relative URLs
     */
    public HtmlStreamTokenizer(Reader in, URL base) {
        textBuffer = new StringBuffer();
        whitespaceBuffer = new StringBuffer();
        pushbackChar = 0;
        state = State.TEXT;
        this.in = new BufferedReader(in);
        this.base = base;
    }

    /**
     * Returns the next token from the stream.
     *
     * A token previously handed to {@link #pushback(Token)} is returned first.
     * Once the end of the stream has been reached, every further call returns
     * an EOF token.
     *
     * @return the next token; never null
     * @throws IOException if reading from the underlying reader fails
     * @throws ParseException if a tag cannot be parsed (for example an empty tag)
     */
    public Token nextToken() throws IOException, ParseException {
        if (pushbackToken != null) {
            Token token = pushbackToken;
            pushbackToken = null;
            return token;
        }

        // number of consecutive '-' characters seen at the tail of a comment
        int closingComment = 0;

        textBuffer.setLength(0);
        whitespaceBuffer.setLength(0);

        while (true) {
            int ch;
            if (pushbackChar != 0) {
                ch = pushbackChar;
                pushbackChar = 0;
            } else {
                ch = in.read();
            }

            if (ch < 0) {
                State oldState = state;
                state = State.EOF;
                // only pending plain text is flushed; an unfinished tag or comment is dropped
                if (textBuffer.length() > 0 && oldState == State.TEXT) {
                    return textToken(false);
                }
                return new Token();
            }

            if (state == State.TEXT) {
                if (ch == '<') {
                    state = State.TAG;
                    if (textBuffer.length() > 0) {
                        return textToken(false);
                    }
                } else if (Character.isWhitespace((char) ch)) {
                    // the whitespace belongs to the NEXT token, so re-read it in WS state
                    pushbackChar = ch;
                    state = State.WS;
                    if (textBuffer.length() > 0) {
                        return textToken(false);
                    }
                } else {
                    textBuffer.append((char) ch);
                }
            } else if (state == State.WS) {
                if (!Character.isWhitespace((char) ch)) {
                    pushbackChar = ch;
                    state = State.TEXT;
                } else {
                    whitespaceBuffer.append((char) ch);
                }
            } else if (state == State.TAG) {
                if (ch == '>') {
                    // leave TAG state before parsing so the tokenizer stays usable if parsing throws
                    state = State.TEXT;
                    HtmlTag tag = new HtmlTag(base);
                    parseTag(textBuffer.toString(), tag);
                    return new Token(tag, snapshot(whitespaceBuffer));
                }
                if (ch == '<' && textBuffer.length() == 0) {
                    // "<<" is not a tag: treat both characters as literal text
                    textBuffer.append("<<");
                    state = State.TEXT;
                } else if (ch == '-' && textBuffer.length() == 2 && textBuffer.charAt(1) == '-'
                        && textBuffer.charAt(0) == '!') {
                    // the comment opener has been read completely
                    textBuffer.setLength(0);
                    state = State.COMMENT;
                } else if (ch == '\'' || ch == '"') {
                    quoteChar = ch;
                    textBuffer.append((char) ch);
                    state = State.TAG_QUOTE;
                } else {
                    textBuffer.append((char) ch);
                }
            } else if (state == State.TAG_QUOTE) {
                if (ch == '>') {
                    // be lenient with unterminated quotes: '>' always ends the tag
                    pushbackChar = ch;
                    state = State.TAG;
                } else {
                    textBuffer.append((char) ch);
                    if (ch == quoteChar) {
                        state = State.TAG;
                    }
                }
            } else if (state == State.COMMENT) {
                if (ch == '>' && closingComment >= 2) {
                    // drop the two dashes of the comment terminator
                    textBuffer.setLength(textBuffer.length() - 2);
                    closingComment = 0;
                    state = State.TEXT;
                    return textToken(true);
                }
                if (ch == '-') {
                    closingComment++;
                } else {
                    closingComment = 0;
                }
                textBuffer.append((char) ch);
            }
        }
    }

    /**
     * Pushes the token back into the queue, to be returned by the subsequent
     * call to nextToken. Only one token can be held at a time; a second call
     * replaces the first token.
     *
     * @param token the token to return from the next call to nextToken
     */
    public void pushback(Token token) {
        pushbackToken = token;
    }

    /**
     * Builds a text or comment token from the current buffers.
     *
     * The buffers are copied because they are cleared and reused on the next
     * call to nextToken; without the copy a token kept by the caller would
     * change under its feet.
     */
    private Token textToken(boolean comment) {
        return new Token(snapshot(textBuffer), snapshot(whitespaceBuffer), comment);
    }

    /** Returns an independent copy of the given buffer. */
    private static StringBuffer snapshot(StringBuffer buffer) {
        return new StringBuffer(buffer.toString());
    }

    /** Returns the index of the first non-whitespace character of s at or after i, or s.length(). */
    private static int skipWhitespace(String s, int i) {
        while (i < s.length() && Character.isWhitespace(s.charAt(i))) {
            i++;
        }
        return i;
    }

    /**
     * Parses an HTML tag out of a string of characters.
     *
     * @param s the text found between '<' and '>'
     * @param tag the tag object that receives the name and attributes
     * @throws ParseException if s is empty or contains only whitespace
     */
    private static void parseTag(String s, HtmlTag tag) throws ParseException {
        int i = skipWhitespace(s, 0);
        if (i == s.length()) {
            throw new ParseException("parse empty tag", 0);
        }

        // the tag name is everything up to the next whitespace
        int start = i;
        while (i < s.length() && !Character.isWhitespace(s.charAt(i))) {
            i++;
        }
        tag.setTagName(s.substring(start, i));

        i = skipWhitespace(s, i);
        if (i < s.length()) {
            parseAttributes(tag, s, i);
        }
    }

    /**
     * Parses HTML tag attributes from a buffer and sets them in an HtmlTag.
     *
     * @param tag the tag that receives the attributes
     * @param s the text of the tag
     * @param i index in s at which the attribute list starts
     */
    private static void parseAttributes(HtmlTag tag, String s, int i) throws ParseException {
        final int length = s.length();
        while (i < length) {
            i = skipWhitespace(s, i);
            if (i == length) {
                return;
            }

            // read the attribute name -- the rule might be looser than the RFC specifies:
            // everything up to a space or an equal sign is included
            int start = i;
            while (i < length && !Character.isWhitespace(s.charAt(i)) && s.charAt(i) != '=') {
                i++;
            }
            // fixed locale: the default locale may map 'I' to a dotless i (e.g. Turkish)
            String attributeName = s.substring(start, i).toLowerCase(Locale.ENGLISH);

            i = skipWhitespace(s, i);
            if (i == length || s.charAt(i) != '=') {
                // no attribute value
                tag.setAttribute(attributeName, "");
                continue;
            }

            // skip the '=' and any whitespace up to the start of the attribute value
            i = skipWhitespace(s, i + 1);
            if (i == length) {
                return;
            }

            // read the attribute value -- the rule for unquoted attribute value is
            // looser than the one in Conolly's W3C 1996 lexical analyzer draft: everything
            // is included up to the next space
            String attributeValue;
            char first = s.charAt(i);
            if (first == '"' || first == '\'') {
                start = i + 1;
                int end = s.indexOf(first, start);
                if (end < 0) {
                    // unterminated quote: the attribute is dropped
                    return;
                }
                attributeValue = unescape(s.substring(start, end));
                i = end + 1;
            } else {
                start = i;
                while (i < length && !Character.isWhitespace(s.charAt(i))) {
                    i++;
                }
                attributeValue = s.substring(start, i);
            }
            tag.setAttribute(attributeName, attributeValue);
        }
    }

    /**
     * Returns a string with HTML escapes changed into their corresponding characters.
     *
     * @param s the text to unescape
     * @return s itself if it contains no '&', otherwise a new unescaped string
     */
    public static String unescape(String s) {
        if (s.indexOf('&') == -1) {
            return s;
        }
        StringBuffer sb = new StringBuffer(s);
        unescape(sb);
        return sb.toString();
    }

    /**
     * Replaces (in-place) HTML escapes in a StringBuffer with their corresponding characters.
     *
     * A reference may end with ';', with any other character that cannot be
     * part of a reference name, or at the end of the buffer. Only a ';' is
     * consumed together with the reference; any other terminating character is
     * kept. Text that looks like a reference but is not a known one is left
     * untouched.
     *
     * @param sb the buffer to unescape; modified in place
     * @return sb, for convenience
     */
    public static StringBuffer unescape(StringBuffer sb) {
        int i = 0; // index into the unprocessed section of the buffer
        int j = 0; // index into the processed section of the buffer (always <= i)

        while (i < sb.length()) {
            char ch = sb.charAt(i);
            if (ch == '&') {
                int start = i;

                // find the end of the reference name; '#' is allowed only right after '&'
                int end = start + 1;
                while (end < sb.length()) {
                    char c = sb.charAt(end);
                    if (!Character.isLetterOrDigit(c) && !(c == '#' && end == start + 1)) {
                        break;
                    }
                    end++;
                }

                Character character = parseReference(sb.substring(start + 1, end));
                if (character != null) {
                    ch = character.charValue();
                    // swallow the terminator only if it is ';' -- anything else is real content
                    boolean semicolon = end < sb.length() && sb.charAt(end) == ';';
                    i = semicolon ? end : end - 1;
                }
                // otherwise: not an HTML escape; i still points at '&', which is copied as is
            }
            sb.setCharAt(j, ch);
            i++;
            j++;
        }

        sb.setLength(j);
        return sb;
    }

    /**
     * Parses HTML character and entity references and returns the
     * corresponding character.
     *
     * @param s the reference without the leading '&' and the trailing ';'
     * @return the referenced character, or null if s is not a known entity name
     *         or not a numeric reference to a value that fits in a char
     */
    private static Character parseReference(String s) {
        if (s.length() == 0) {
            return null;
        }
        if (s.charAt(0) != '#') {
            // entity reference
            return entities.get(s);
        }

        // character reference
        if (s.length() == 1) {
            return null;
        }
        try {
            int value;
            char marker = s.charAt(1);
            if (marker == 'x' || marker == 'X') {
                // Hex reference
                value = Integer.parseInt(s.substring(2), 16);
            } else {
                // Decimal reference
                value = Integer.parseInt(s.substring(1));
            }
            // a cast would silently wrap values that do not fit in a char
            if (value < Character.MIN_VALUE || value > Character.MAX_VALUE) {
                return null;
            }
            return Character.valueOf((char) value);
        } catch (NumberFormatException e) {
            return null;
        }
    }

    /**
     * Class for current token.
     */
    public static class Token {
        public static final Type EOF = new Type();

        public static final Type TEXT = new Type();

        public static final Type TAG = new Type();

        public static final Type COMMENT = new Type();

        /** token's type */
        private final Type type;

        /** token's value */
        private final Object value;

        /** whitespace preceding the token */
        private final StringBuffer whitespace;

        /**
         * Constructor for the EOF token.
         */
        protected Token() {
            type = EOF;
            value = null;
            whitespace = null;
        }

        /**
         * Constructor for the HTML tag tokens.
         */
        protected Token(HtmlTag tag, StringBuffer whitespace) {
            type = TAG;
            value = tag;
            this.whitespace = whitespace;
        }

        /**
         * Constructor for regular text and comments.
         */
        protected Token(StringBuffer text, StringBuffer whitespace, boolean comment) {
            if (comment) {
                type = COMMENT;
            } else {
                type = TEXT;
            }
            this.value = text;
            this.whitespace = whitespace;
        }

        /**
         * Returns the token's type.
         */
        public Type getType() {
            return type;
        }

        /**
         * Returns the whitespace preceding the token.
         * For tokens of type EOF, the whitespace is null.
         */
        public StringBuffer getWhitespace() {
            return whitespace;
        }

        /**
         * Returns the token's value. This is an HtmlTag for tokens of type TAG
         * and a StringBuffer for tokens of type TEXT and COMMENT.
         * For tokens of type EOF, the value is null.
         */
        public Object getValue() {
            return value;
        }

        /**
         * Returns the string representation of the token, including the preceding whitespace.
         * Tags are wrapped in angle brackets and comments in the HTML comment delimiters.
         */
        @Override
        public String toString() {
            StringBuffer sb = new StringBuffer();
            if (whitespace != null) {
                sb.append(whitespace);
            }
            if (value != null) {
                if (type == TAG) {
                    sb.append('<');
                } else if (type == COMMENT) {
                    sb.append("<!--");
                }
                sb.append(value);
                if (type == TAG) {
                    sb.append('>');
                } else if (type == COMMENT) {
                    sb.append("-->");
                }
            }
            return sb.toString();
        }

        /**
         * Private enum class for token type.
         */
        private static class Type {
            private Type() {
                // don't need to do anything
            }
        }
    }

    /**
     * Parser states.
     */
    private enum State {
        EOF, COMMENT, TEXT, TAG, WS, TAG_QUOTE
    }

    /** names and values of HTML entity references */
    private static final HashMap<String, Character> entities = new HashMap<String, Character>();

    /** Registers one entity reference name together with the character it stands for. */
    private static void define(String name, char value) {
        entities.put(name, Character.valueOf(value));
    }

    /*
     * Based on ISO 8879.
     *
     * Portions © International Organization for Standardization 1986 Permission
     * to copy in any form is granted for use with conforming SGML systems and
     * applications as defined in ISO 8879, provided this notice is included in
     * all copies.
     *
     */
    static {
        define("nbsp", '\240'); // no-break space = non-breaking space
        define("iexcl", '\241'); // inverted exclamation mark
        define("cent", '\242'); // cent sign
        define("pound", '\243'); // pound sign
        define("curren", '\244'); // currency sign
        define("yen", '\245'); // yen sign = yuan sign
        define("brvbar", '\246'); // broken bar = broken vertical bar
        define("sect", '\247'); // section sign
        define("uml", '\250'); // diaeresis = spacing diaeresis
        define("copy", '\251'); // copyright sign
        define("ordf", '\252'); // feminine ordinal indicator
        define("laquo", '\253'); // left-pointing double angle quotation mark = left pointing guillemet
        define("not", '\254'); // not sign
        define("shy", '\255'); // soft hyphen = discretionary hyphen
        define("reg", '\256'); // registered sign = registered trade mark sign
        define("macr", '\257'); // macron = spacing macron = overline = APL overbar
        define("deg", '\260'); // degree sign
        define("plusmn", '\261'); // plus-minus sign = plus-or-minus sign
        define("sup2", '\262'); // superscript two = superscript digit two = squared
        define("sup3", '\263'); // superscript three = superscript digit three = cubed
        define("acute", '\264'); // acute accent = spacing acute
        define("micro", '\265'); // micro sign
        define("para", '\266'); // pilcrow sign = paragraph sign
        define("middot", '\267'); // middle dot = Georgian comma = Greek middle dot
        define("cedil", '\270'); // cedilla = spacing cedilla
        define("sup1", '\271'); // superscript one = superscript digit one
        define("ordm", '\272'); // masculine ordinal indicator
        define("raquo", '\273'); // right-pointing double angle quotation mark = right pointing guillemet
        define("frac14", '\274'); // vulgar fraction one quarter = fraction one quarter
        define("frac12", '\275'); // vulgar fraction one half = fraction one half
        define("frac34", '\276'); // vulgar fraction three quarters = fraction three quarters
        define("iquest", '\277'); // inverted question mark = turned question mark
        define("Agrave", '\300'); // latin capital letter A with grave = latin capital letter A grave
        define("Aacute", '\301'); // latin capital letter A with acute
        define("Acirc", '\302'); // latin capital letter A with circumflex
        define("Atilde", '\303'); // latin capital letter A with tilde
        define("Auml", '\304'); // latin capital letter A with diaeresis
        define("Aring", '\305'); // latin capital letter A with ring above = latin capital letter A ring
        define("AElig", '\306'); // latin capital letter AE = latin capital ligature AE
        define("Ccedil", '\307'); // latin capital letter C with cedilla
        define("Egrave", '\310'); // latin capital letter E with grave
        define("Eacute", '\311'); // latin capital letter E with acute
        define("Ecirc", '\312'); // latin capital letter E with circumflex
        define("Euml", '\313'); // latin capital letter E with diaeresis
        define("Igrave", '\314'); // latin capital letter I with grave
        define("Iacute", '\315'); // latin capital letter I with acute
        define("Icirc", '\316'); // latin capital letter I with circumflex
        define("Iuml", '\317'); // latin capital letter I with diaeresis
        define("ETH", '\320'); // latin capital letter ETH
        define("Ntilde", '\321'); // latin capital letter N with tilde
        define("Ograve", '\322'); // latin capital letter O with grave
        define("Oacute", '\323'); // latin capital letter O with acute
        define("Ocirc", '\324'); // latin capital letter O with circumflex
        define("Otilde", '\325'); // latin capital letter O with tilde
        define("Ouml", '\326'); // latin capital letter O with diaeresis
        define("times", '\327'); // multiplication sign
        define("Oslash", '\330'); // latin capital letter O with stroke = latin capital letter O slash
        define("Ugrave", '\331'); // latin capital letter U with grave
        define("Uacute", '\332'); // latin capital letter U with acute
        define("Ucirc", '\333'); // latin capital letter U with circumflex
        define("Uuml", '\334'); // latin capital letter U with diaeresis
        define("Yacute", '\335'); // latin capital letter Y with acute
        define("THORN", '\336'); // latin capital letter THORN
        define("szlig", '\337'); // latin small letter sharp s = ess-zed
        define("agrave", '\340'); // latin small letter a with grave = latin small letter a grave
        define("aacute", '\341'); // latin small letter a with acute
        define("acirc", '\342'); // latin small letter a with circumflex
        define("atilde", '\343'); // latin small letter a with tilde
        define("auml", '\344'); // latin small letter a with diaeresis
        define("aring", '\345'); // latin small letter a with ring above = latin small letter a ring
        define("aelig", '\346'); // latin small letter ae = latin small ligature ae
        define("ccedil", '\347'); // latin small letter c with cedilla
        define("egrave", '\350'); // latin small letter e with grave
        define("eacute", '\351'); // latin small letter e with acute
        define("ecirc", '\352'); // latin small letter e with circumflex
        define("euml", '\353'); // latin small letter e with diaeresis
        define("igrave", '\354'); // latin small letter i with grave
        define("iacute", '\355'); // latin small letter i with acute
        define("icirc", '\356'); // latin small letter i with circumflex
        define("iuml", '\357'); // latin small letter i with diaeresis
        define("eth", '\360'); // latin small letter eth
        define("ntilde", '\361'); // latin small letter n with tilde
        define("ograve", '\362'); // latin small letter o with grave
        define("oacute", '\363'); // latin small letter o with acute
        define("ocirc", '\364'); // latin small letter o with circumflex
        define("otilde", '\365'); // latin small letter o with tilde
        define("ouml", '\366'); // latin small letter o with diaeresis
        define("divide", '\367'); // division sign
        define("oslash", '\370'); // latin small letter o with stroke = latin small letter o slash
        define("ugrave", '\371'); // latin small letter u with grave
        define("uacute", '\372'); // latin small letter u with acute
        define("ucirc", '\373'); // latin small letter u with circumflex
        define("uuml", '\374'); // latin small letter u with diaeresis
        define("yacute", '\375'); // latin small letter y with acute
        define("thorn", '\376'); // latin small letter thorn
        define("yuml", '\377'); // latin small letter y with diaeresis

        // Special characters
        define("quot", '"'); // quotation mark = APL quote
        define("amp", '&'); // ampersand
        define("lt", '<'); // less-than sign
        define("gt", '>'); // greater-than sign

        // Latin Extended-A
        define("OElig", '\u0152'); // latin capital ligature OE
        define("oelig", '\u0153'); // latin small ligature oe, ligature is a misnomer, this is a separate character in some languages
        define("Scaron", '\u0160'); // latin capital letter S with caron
        define("scaron", '\u0161'); // latin small letter s with caron
        define("Yuml", '\u0178'); // latin capital letter Y with diaeresis

        // Spacing Modifier Letters
        define("circ", '\u02c6'); // modifier letter circumflex accent
        define("tilde", '\u02dc'); // small tilde

        // General punctuation
        define("ensp", '\u2002'); // en space
        define("emsp", '\u2003'); // em space
        define("thinsp", '\u2009'); // thin space
        define("zwnj", '\u200c'); // zero width non-joiner
        define("zwj", '\u200d'); // zero width joiner
        define("lrm", '\u200e'); // left-to-right mark
        define("rlm", '\u200f'); // right-to-left mark
        define("ndash", '\u2013'); // en dash
        define("mdash", '\u2014'); // em dash
        define("lsquo", '\u2018'); // left single quotation mark
        define("rsquo", '\u2019'); // right single quotation mark
        define("sbquo", '\u201a'); // single low-9 quotation mark
        define("ldquo", '\u201c'); // left double quotation mark
        define("rdquo", '\u201d'); // right double quotation mark
        define("bdquo", '\u201e'); // double low-9 quotation mark
        define("dagger", '\u2020'); // dagger
        define("Dagger", '\u2021'); // double dagger
        define("permil", '\u2030'); // per mille sign
        define("lsaquo", '\u2039'); // single left-pointing angle quotation mark, not yet standardized
        define("rsaquo", '\u203a'); // single right-pointing angle quotation mark, not yet standardized
        define("euro", '\u20ac'); // euro sign
    }
}