Skip to main content
summaryrefslogtreecommitdiffstats
blob: b9c7f589a9e706346e835d79da1495dc0eb1b222 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
/*******************************************************************************
 * Copyright (c) 2000, 2003 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials 
 * are made available under the terms of the Common Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/cpl-v10.html
 * 
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *******************************************************************************/

package org.eclipse.help.internal.search;

import java.io.*;
import java.net.*;
import java.util.*;

import org.apache.lucene.demo.html.*;
import org.eclipse.help.internal.*;

/**
 * Parser of HTML documents.
 * Extracts document encoding from header,
 * and delegates to lucene HTML parser for extraction
 * of title, summary, and content.
 * <p>
 * Usage: call {@link #openDocument(URL)}, then any of
 * {@link #getTitle()}, {@link #getSummary()}, {@link #getContentReader()},
 * and finally {@link #closeDocument()}.
 */
public class HTMLDocParser {
	// maximum number of characters that will be searched
	// from the beginning of HTML document to charset declaration
	private static final int MAX_OFFSET = 2048;

	// elements, attributes and values constants
	// (all of them are compared ignoring case)
	static final String ELEMENT_META = "META";
	static final String ELEMENT_BODY = "body";
	static final String ELEMENT_HEAD = "head";
	static final String ATTRIBUTE_HTTP = "http-equiv";
	static final String ATTRIBUTE_HTTP_VALUE = "content-type";
	static final String ATTRIBUTE_CONTENT = "content";

	// name of the media type parameter that carries the encoding
	private static final String PARAMETER_CHARSET = "charset";

	// states for parsing elements
	static final int STATE_ELEMENT_START = 0;
	static final int STATE_ELEMENT_AFTER_LT = 1;
	static final int STATE_ELEMENT_AFTER_LT_SLASH = 2;
	static final int STATE_ELEMENT_META = 3;
	// states for parsing HTTP-EQUIV attribute
	static final int STATE_HTTP_START = 0;
	static final int STATE_HTTP_AFTER_NAME = 1;
	static final int STATE_HTTP_AFTER_EQ = 2;
	static final int STATE_HTTP_DONE = 3;
	// states for parsing CONTENT attribute
	static final int STATE_CONTENT_START = 0;
	static final int STATE_CONTENT_AFTER_NAME = 1;
	static final int STATE_CONTENT_AFTER_EQ = 2;
	static final int STATE_CONTENT_DONE = 3;

	// parser of the currently opened document, null if none is open
	private HTMLParser htmlParser;
	// stream feeding htmlParser, closed by closeDocument()
	private InputStream inputStream = null;

	/**
	 * Opens the document for parsing.
	 * The URL is opened twice: the first stream is only used to look for
	 * the charset declaration, the second one is handed to the HTML parser
	 * with the detected encoding (or the default encoding if none was
	 * found or the JVM does not support it).
	 * Any document opened earlier by this instance is released first.
	 * @param url location of the HTML document
	 * @throws IOException if a stream for the URL cannot be opened
	 */
	public void openDocument(URL url) throws IOException {
		// release whatever an earlier openDocument() left behind, so that
		// a failure below cannot leak a stream or expose a stale parser
		closeQuietly(inputStream);
		inputStream = null;
		htmlParser = null;

		// first pass: detect the encoding
		String encoding;
		InputStream detectionStream = url.openStream();
		try {
			encoding = getCharsetFromHTML(detectionStream);
		} finally {
			closeQuietly(detectionStream);
		}

		// second pass: the stream that is actually parsed
		inputStream = url.openStream();
		htmlParser = new HTMLParser(createReader(inputStream, encoding, url));
	}
	/**
	 * Creates reader decoding the stream with the given encoding,
	 * falling back to the default encoding when the encoding is
	 * null or not supported by the JVM.
	 */
	private InputStreamReader createReader(
		InputStream in,
		String encoding,
		URL url) {
		if (encoding != null) {
			try {
				return new InputStreamReader(in, encoding);
			} catch (UnsupportedEncodingException uee) {
				if (HelpBasePlugin.DEBUG_SEARCH) {
					System.out.println(
						this.getClass().getName()
							+ " JVM does not support encoding "
							+ encoding
							+ " specified in document "
							+ url.getPath()
							+ ". Default encoding will be used during indexing.");
				}
			}
		} else if (HelpBasePlugin.DEBUG_SEARCH) {
			System.out.println(
				this.getClass().getName()
					+ " Encoding not found in document "
					+ url.getPath()
					+ ". Default encoding will be used during indexing.");
		}
		return new InputStreamReader(in);
	}
	/**
	 * Closes the stream ignoring failures; does nothing for null.
	 */
	private static void closeQuietly(InputStream stream) {
		if (stream != null) {
			try {
				stream.close();
			} catch (IOException closeIOE) {
				// nothing useful can be done if closing fails
			}
		}
	}
	/**
	 * Releases resources (closes streams)
	 */
	public void closeDocument() {
		closeQuietly(inputStream);
		inputStream = null;
	}
	/**
	 * Returns the parser of the opened document.
	 * @throws NullPointerException if no document has been opened
	 */
	private HTMLParser getOpenedParser() {
		if (htmlParser == null) {
			throw new NullPointerException("No document has been opened; call openDocument(URL) first.");
		}
		return htmlParser;
	}
	/**
	 * Obtains the title from the HTML parser.
	 * @return title, or empty string if the thread was interrupted
	 *  while obtaining it (the interrupt status is preserved)
	 * @throws IOException if thrown by the HTML parser
	 * @throws NullPointerException if no document has been opened
	 */
	public String getTitle() throws IOException {
		HTMLParser parser = getOpenedParser();
		try {
			return parser.getTitle();
		} catch (InterruptedException ie) {
			// keep the interrupt visible to the caller
			Thread.currentThread().interrupt();
			return "";
		}
	}
	/**
	 * Obtains the summary from the HTML parser.
	 * @return summary, or empty string if the thread was interrupted
	 *  while obtaining it (the interrupt status is preserved)
	 * @throws IOException if thrown by the HTML parser
	 * @throws NullPointerException if no document has been opened
	 */
	public String getSummary() throws IOException {
		HTMLParser parser = getOpenedParser();
		try {
			return parser.getSummary();
		} catch (InterruptedException ie) {
			// keep the interrupt visible to the caller
			Thread.currentThread().interrupt();
			return "";
		}
	}
	/**
	 * Obtains the content reader from the HTML parser.
	 * @throws IOException if thrown by the HTML parser
	 * @throws NullPointerException if no document has been opened
	 */
	public Reader getContentReader() throws IOException {
		return getOpenedParser().getReader();
	}
	/**
	 * Parses  HTML to extract document encoding specified in HTTP
	 * equivalent META tag in the document header. Example of such META tag is
	 * <META HTTP-EQUIV="content-type" CONTENT="text/html; charset=UTF-8">
	 * The reader wrapped around the stream is closed before returning.
	 * @param is stream positioned at the beginning of the HTML document
	 * @return String or null if encoding not found
	 */
	public String getCharsetFromHTML(InputStream is) {
		// Set up an ascii reader for the document (documents should not use
		// other characters before encoding is defined)
		Reader asciiReader = new ASCIIReader(is, MAX_OFFSET);
		try {
			StreamTokenizer tokenizer = new StreamTokenizer(asciiReader);
			// keep the case of words; comparisons ignore case anyway
			tokenizer.lowerCaseMode(false);
			// '"' stays a quote character (tokenizer default), while
			// the other default quote character and the default comment
			// character are turned into ordinary single character tokens
			tokenizer.ordinaryChar('\'');
			tokenizer.ordinaryChar('/');

			return getCharsetFromHTMLTokens(tokenizer);
		} finally {
			try {
				asciiReader.close();
			} catch (IOException ioe) {
				// nothing useful can be done if closing fails
			}
		}
	}
	/**
	 * Scans tokens of an HTML document looking for a META element that has
	 * both HTTP-EQUIV attribute with value content-type and a quoted CONTENT
	 * attribute (in any order), and extracts charset from the value of
	 * the CONTENT attribute of the first such element.
	 * Scanning stops when body element is opened or head element is closed.
	 * @param tokenizer tokenizer over the beginning of the document
	 * @return String or null if encoding not found, or reading failed
	 */
	public String getCharsetFromHTMLTokens(StreamTokenizer tokenizer) {
		// keeps track of content attribute value until parsing
		// of the meta tag is complete
		String contentValue = null;

		// initialize states
		int stateContent = STATE_CONTENT_START;
		int stateElement = STATE_ELEMENT_START;
		int stateHttp = STATE_HTTP_START;

		try {
			// in the worst case, process tokens until end of file
			for (int token = tokenizer.nextToken();
				token != StreamTokenizer.TT_EOF;
				token = tokenizer.nextToken()) {

				// process input depending on current state
				switch (stateElement) {
					case STATE_ELEMENT_START :
						if (token == '<') {
							stateElement = STATE_ELEMENT_AFTER_LT;
						} // else do nothing, cannot be beginning of META tag
						break;
					case STATE_ELEMENT_AFTER_LT :
						if (token == StreamTokenizer.TT_WORD) {
							// some element opened
							if (ELEMENT_META
								.equalsIgnoreCase(tokenizer.sval)) {
								// META element opened
								stateElement = STATE_ELEMENT_META;
								// initialize state of attributes
								stateHttp = STATE_HTTP_START;
								stateContent = STATE_CONTENT_START;
								contentValue = null;
							} else if (
								ELEMENT_BODY.equalsIgnoreCase(
									tokenizer.sval)) {
								// body element opened, we are too far, stop processing input
								return null;
							} else {
								// some other element opened, start from initial state
								stateElement = STATE_ELEMENT_START;
							}
						} else if (token == '/') {
							// can be beginning of head closing
							stateElement = STATE_ELEMENT_AFTER_LT_SLASH;
						} else {
							// not an element opened, could be opening of declaration etc.
							stateElement = STATE_ELEMENT_START;
						}
						break;
					case STATE_ELEMENT_AFTER_LT_SLASH :
						if (token == StreamTokenizer.TT_WORD
							&& ELEMENT_HEAD.equalsIgnoreCase(tokenizer.sval)) {
							// head element closed, we are too far, stop processing input
							return null;
						}
						stateElement = STATE_ELEMENT_START;
						break;
					default : // STATE_ELEMENT_META
						{
							// set when the token does not continue any of the
							// attributes we are looking for
							boolean unexpected = false;
							switch (token) {
								case '>' :
									// no longer inside META, start from initial state
									stateElement = STATE_ELEMENT_START;
									break;
								case StreamTokenizer.TT_WORD :
									// string inside META tag, can be attribute name
									if (ATTRIBUTE_HTTP
										.equalsIgnoreCase(tokenizer.sval)) {
										// found HTTP-EQUIV attribute name
										stateHttp = STATE_HTTP_AFTER_NAME;
									} else if (
										ATTRIBUTE_CONTENT.equalsIgnoreCase(
											tokenizer.sval)) {
										// found CONTENT attribute name
										stateContent = STATE_CONTENT_AFTER_NAME;
									} else if (
										stateHttp == STATE_HTTP_AFTER_EQ
											&& ATTRIBUTE_HTTP_VALUE.equalsIgnoreCase(
												tokenizer.sval)) {
										// value of HTTP-EQUIV attribute (unquoted)
										// we found <META ... HTTP-EQUIV=content-type
										stateHttp = STATE_HTTP_DONE;
									} else {
										// some other attribute name or string
										unexpected = true;
									}
									break;
								case '=' :
									// = inside META tag, can separate attribute
									// names we are interested in from values
									if (stateHttp == STATE_HTTP_AFTER_NAME) {
										// we have HTTP-EQUIV=
										stateHttp = STATE_HTTP_AFTER_EQ;
									} else if (
										stateContent == STATE_CONTENT_AFTER_NAME) {
										// we have CONTENT=
										stateContent = STATE_CONTENT_AFTER_EQ;
									} else {
										// equal sign after some other attribute name or string
										unexpected = true;
									}
									break;
								case '\"' :
									// quoted string inside META tag, can be attribute value
									if (stateHttp == STATE_HTTP_AFTER_EQ) {
										// value of HTTP-EQUIV attribute
										if (ATTRIBUTE_HTTP_VALUE
											.equalsIgnoreCase(tokenizer.sval)) {
											// we found <META ... HTTP-EQUIV="content-type"
											stateHttp = STATE_HTTP_DONE;
										}
									} else if (
										stateContent == STATE_CONTENT_AFTER_EQ) {
										// value of CONTENT attribute
										stateContent = STATE_CONTENT_DONE;
										// save the value of the attribute
										// if attribute HTTP-EQUIV="content-type" is found
										// in the same META tag, this value might have
										// Content-type entity header
										contentValue = tokenizer.sval;
									} else {
										// quoted string that is not a value of
										// an attribute we are looking for
										unexpected = true;
									}
									break;
								default :
									// other unexpected token inside META tag
									unexpected = true;
									break;
							}
							if (unexpected) {
								// reset states of sought attributes,
								// unless successfully processed earlier
								if (stateHttp != STATE_HTTP_DONE) {
									stateHttp = STATE_HTTP_START;
								}
								if (stateContent != STATE_CONTENT_DONE) {
									stateContent = STATE_CONTENT_START;
								}
							}
						}
						break;
				}
				if (contentValue != null
					&& stateHttp == STATE_HTTP_DONE
					&& stateContent == STATE_CONTENT_DONE) {
					// <META HTTP-EQUIV="content-type" CONTENT="*******"
					// parse value of content attribute to extract encoding
					return getCharsetFromHTTP(contentValue);
				}

			}
		} catch (IOException ioe) {
			return null;
		}
		// end of file
		return null;
	}
	/**
	 * Parses HTTP1.1 Content-Type entity-header field
	 * for example, Content-Type: text/html; charset=ISO-8859-4,
	 * and extracts charset parameter value of the media sub type.
	 * The parameter name is matched ignoring case, and quotes
	 * surrounding the value are removed.
	 * @param contentValue media type, for example text/html; charset=ISO-8859-4
	 * @return value of charset parameter, for example ISO-8859-4
	 *  or  null if parameter does not exist or contentValue is null
	 */
	public String getCharsetFromHTTP(String contentValue) {
		if (contentValue == null) {
			return null;
		}
		StringTokenizer t = new StringTokenizer(contentValue, ";");
		while (t.hasMoreTokens()) {
			String parameter = t.nextToken().trim();
			int eq = parameter.indexOf('=');
			if (eq < 0) {
				// not a name=value pair, e.g. the media type itself
				continue;
			}
			String name = parameter.substring(0, eq).trim();
			if (!PARAMETER_CHARSET.equalsIgnoreCase(name)) {
				continue;
			}
			String charset = stripQuotes(parameter.substring(eq + 1).trim());
			if (charset.length() > 0) {
				return charset;
			}
		}
		return null;
	}
	/**
	 * Removes a pair of matching single or double quotes
	 * surrounding the value, if present.
	 */
	private static String stripQuotes(String value) {
		int length = value.length();
		if (length >= 2) {
			char first = value.charAt(0);
			if ((first == '"' || first == '\'')
				&& value.charAt(length - 1) == first) {
				return value.substring(1, length - 1).trim();
			}
		}
		return value;
	}
}

Back to the top