Skip to main content
summaryrefslogtreecommitdiffstats
blob: eca7bf5a198327f0ab2954f9a68410e81caf349e (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
//------------------------------------------------------------------------------
// Copyright (c) 2005, 2007 IBM Corporation and others.
// All rights reserved. This program and the accompanying materials
// are made available under the terms of the Eclipse Public License v1.0
// which accompanies this distribution, and is available at
// http://www.eclipse.org/legal/epl-v10.html
//
// Contributors:
// IBM Corporation - initial implementation
//------------------------------------------------------------------------------
package org.eclipse.epf.common.html;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.io.PrintWriter;
import java.io.StringWriter;
import java.io.UnsupportedEncodingException;

import org.eclipse.epf.common.IHTMLFormatter;
import org.eclipse.epf.common.utils.FileUtil;
import org.eclipse.epf.common.utils.StrUtil;
import org.w3c.tidy.Tidy;

/**
 * Pretty-formats HTML source and makes it XHTML compliant.
 * <p>
 * The actual clean-up is delegated to JTidy ({@link Tidy}); this class
 * configures JTidy from its line width / indent settings, captures JTidy's
 * error output and, when only a fragment was supplied (or body-only output
 * was requested), strips the generated document wrapper again so that only
 * the content of the <code>&lt;body&gt;</code> element is returned.
 * <p>
 * Instances keep the error text of the most recent
 * <code>formatHTML</code> call in a field (see {@link #getLastErrorStr()}).
 * 
 * @author Kelvin Low
 * @since 1.0
 */
public class DefaultHTMLFormatter implements IHTMLFormatter{

	/** Start of the opening body tag; the closing '&gt;' is searched separately because the tag may carry attributes. */
	protected static final String HTML_BODY_START_TAG = "<body"; //$NON-NLS-1$

	/** The closing body tag. */
	protected static final String HTML_BODY_END_TAG = "</body>"; //$NON-NLS-1$

	/** Opening pre tag (exact match, no attributes) whose content must keep its indentation. */
	public static final String PRE_TAG_START = "<pre>"; //$NON-NLS-1$

	/** Closing pre tag. */
	public static final String PRE_TAG_END = "</pre>"; //$NON-NLS-1$

	/** Length of {@link #PRE_TAG_END}. */
	public static final int PRE_TAG_END_LENGTH = PRE_TAG_END.length();

	/** Name of the encoding used to pass text to JTidy and to read its output back. */
	private static final String UTF_8 = "UTF-8"; //$NON-NLS-1$

	/** Maximum line width (in characters) passed to JTidy as the wrap length. */
	protected int lineWidth;

	/** Whether JTidy should indent element content. */
	protected boolean indent;

	/** Number of spaces per indent level. */
	protected int indentSize;

	/** Error text captured from JTidy by the last formatHTML call, or null if none was recorded. */
	protected String lastErrorStr;

	/**
	 * Creates a new instance with a line width of 132 characters and
	 * indenting enabled with an indent size of 4.
	 */
	public DefaultHTMLFormatter() {
		this(132, true, 4);
	}

	/**
	 * Creates a new instance.
	 * 
	 * @param lineWidth
	 *            The line width (in number of characters).
	 * @param indent
	 *            If true, indent the tags.
	 * @param indentSize
	 *            The indent size (in number of characters).
	 */
	public DefaultHTMLFormatter(int lineWidth, boolean indent, int indentSize) {
		this.lineWidth = lineWidth;
		this.indent = indent;
		this.indentSize = indentSize;
	}

	/**
	 * Sets the maximum character width of a line.
	 * 
	 * @param lineWidth
	 *            The line width (in number of characters).
	 */
	public void setLineWidth(int lineWidth) {
		this.lineWidth = lineWidth;
	}

	/**
	 * Enables or disables tags indent.
	 * 
	 * @param indent
	 *            If true, indent the tags.
	 */
	public void setIndent(boolean indent) {
		this.indent = indent;
	}

	/**
	 * Sets the indent size.
	 * 
	 * @param indentSize
	 *            The indent size (in number of characters).
	 */
	public void setIndentSize(int indentSize) {
		this.indentSize = indentSize;
	}

	/**
	 * Formats the given HTML source.
	 * <p>
	 * Equivalent to calling the five-argument overload with all flags set
	 * to <code>false</code>.
	 * 
	 * @param html
	 *            The HTML source.
	 * @return The pretty-formatted HTML source.
	 * @throws UnsupportedEncodingException
	 *             if the UTF-8 encoding is not supported.
	 */
	public String formatHTML(String html) throws UnsupportedEncodingException {
		return formatHTML(html, false, false, false, false);
	}

	/**
	 * Formats the given HTML source.
	 * <p>
	 * A <code>null</code> or empty input is returned unchanged. Otherwise
	 * leading whitespace is removed, the text is run through JTidy, and any
	 * error text JTidy produced is made available through
	 * {@link #getLastErrorStr()}.
	 * 
	 * @param html The HTML source.
	 * @param returnBodyOnly if false, return full HTML document or body content based on what is passed in.  if true, always return body content only
	 * @param forceOutput if true, return cleaned HTML even if errors. if false, will clean minor problems and return clean HTML, but on a major error, will set getLastErrorStr() and return passed-in html
	 * @param makeBare set to true for cleaning MS HTML
	 * @param word2000 set to true for cleaning MS Word 2000 HTML 
	 * @return the formatted document; or only the content of its body element
	 *         when <code>returnBodyOnly</code> is true or the input does not
	 *         start with <code>&lt;!DOCTYPE</code> or <code>&lt;html</code>
	 *         (an empty string if no body content can be located); or the
	 *         input (with leading whitespace removed) when an error was
	 *         reported and <code>forceOutput</code> is false
	 * @throws UnsupportedEncodingException if the UTF-8 encoding is not supported
	 */
	public String formatHTML(String html, boolean returnBodyOnly, boolean forceOutput, boolean makeBare, boolean word2000) throws UnsupportedEncodingException {
		lastErrorStr = null;
		if (html == null || html.length() == 0) {
			return html;
		}

		html = removeLeadingWhitespace(html);

		Tidy tidy = createTidy(makeBare, word2000);

		// Collect JTidy's diagnostics in memory so they can be inspected below.
		StringWriter errorText = new StringWriter();
		PrintWriter errorWriter = new PrintWriter(errorText);
		tidy.setErrout(errorWriter);

		InputStream input = new ByteArrayInputStream(html.getBytes(UTF_8));
		ByteArrayOutputStream output = new ByteArrayOutputStream();
		tidy.parse(input, output);
		errorWriter.flush();

		// Only output of the form "line ... column ..." is treated as an error report.
		String error = errorText.toString();
		boolean errorReported = error.startsWith("line") && error.indexOf("column") > 0; //$NON-NLS-1$ //$NON-NLS-2$
		if (errorReported) {
			lastErrorStr = error;
			if (!forceOutput) {
				// Hand back the (whitespace-trimmed) input rather than JTidy's output.
				// NOTE(review): no force-output option is set on JTidy here, so with
				// forceOutput == true the output below is whatever JTidy emitted
				// despite the error - confirm this is sufficient.
				return html;
			}
		}

		String formattedHTML = new String(output.toByteArray(), UTF_8);
		formattedHTML = StrUtil.getEscapedHTML(formattedHTML);

		// Decide from the first few characters whether the caller passed a full document.
		String htmlStartUpper = html.substring(0, Math.min(10, html.length())).toUpperCase();
		boolean fullDocumentPassedIn = htmlStartUpper.startsWith("<!DOCTYPE") || htmlStartUpper.startsWith("<HTML"); //$NON-NLS-1$ //$NON-NLS-2$

		if (returnBodyOnly || !fullDocumentPassedIn) {
			return extractBodyContent(formattedHTML);
		}
		return formattedHTML;
	}

	/**
	 * Creates a JTidy instance configured from this formatter's settings.
	 * 
	 * @param makeBare
	 *            if true, enable JTidy's "make clean" option.
	 * @param word2000
	 *            if true, enable JTidy's Word 2000 cleaning.
	 * @return the configured JTidy instance.
	 */
	private Tidy createTidy(boolean makeBare, boolean word2000) {
		Tidy tidy = new Tidy();
		tidy.setXHTML(true);
		tidy.setDropEmptyParas(false);
		tidy.setDropFontTags(false);
		tidy.setQuiet(true);
		tidy.setShowWarnings(false);
		tidy.setSmartIndent(false);
		tidy.setTidyMark(false);
		tidy.setWraplen(lineWidth);
		tidy.setIndentAttributes(false);
		tidy.setIndentContent(indent);
		tidy.setSpaces(indentSize);
		tidy.setCharEncoding(org.w3c.tidy.Configuration.UTF8);
		tidy.setFixBackslash(false);
		// Deliberately not enabled:
		// - setEncloseBlockText(true): adds <p> around each text block.
		// - setPrintBodyOnly(true): seemed to prevent JTidy from indenting the
		//   source, which is why the body is cut out of the output manually.

		if (makeBare) {
			// remove MS clutter
			tidy.setMakeClean(true);
		}
		if (word2000) {
			// draconian Word2000 cleaning
			tidy.setWord2000(true);
		}
		return tidy;
	}

	/**
	 * Cuts the content of the body element out of a formatted document.
	 * <p>
	 * The extracted content is always returned; when indenting is enabled
	 * the extra indentation of the body content is removed as well.
	 * 
	 * @param formattedHTML
	 *            the formatted document.
	 * @return the body content, or an empty string if the body tags cannot
	 *         be found or enclose nothing.
	 */
	private String extractBodyContent(String formattedHTML) {
		final String nothing = ""; //$NON-NLS-1$

		int bodyTagStart = formattedHTML.indexOf(HTML_BODY_START_TAG);
		if (bodyTagStart == -1) {
			return nothing;
		}
		int start = formattedHTML.indexOf(">", bodyTagStart); //$NON-NLS-1$
		int end = formattedHTML.indexOf(HTML_BODY_END_TAG);
		if (start == -1 || end == -1) {
			return nothing;
		}

		// Step past the '>' that closes the opening body tag.
		start += 1;
		if (start >= end) {
			return nothing;
		}

		// Skip the line break after the opening tag and drop the one before the
		// closing tag (plus the indent in front of the closing tag, if any).
		// NOTE(review): presumably FileUtil.LINE_SEP_LENGTH is the length of the
		// line separator JTidy writes - confirm.
		start += FileUtil.LINE_SEP_LENGTH;
		end -= FileUtil.LINE_SEP_LENGTH;
		boolean indented = indent && indentSize > 0;
		if (indented) {
			end -= indentSize;
		}
		if (start >= end) {
			return nothing;
		}

		String result = formattedHTML.substring(start, end);
		if (indented) {
			// The removed indent is two indent levels wide.
			result = fixIndentation(result, getIndentStr(indentSize * 2));
		}
		return result;
	}

	/**
	 * Returns the indent string.
	 * 
	 * @param indentLength
	 *            the number of spaces wanted.
	 * @return a string of <code>indentLength</code> spaces; empty if
	 *         <code>indentLength</code> is zero or negative.
	 */
	protected static String getIndentStr(int indentLength) {
		StringBuffer indentStr = new StringBuffer();
		for (int i = 0; i < indentLength; i++) {
			indentStr.append(' ');
		}
		return indentStr.toString();
	}

	/**
	 * Undo the JTidy indent, but ignore &lt;pre&gt; tags
	 * <p>
	 * One leading occurrence of <code>indentStr</code> is removed from the
	 * start of the text and from the start of every line that follows a
	 * "\r\n" line break, except inside <code>&lt;pre&gt;...&lt;/pre&gt;</code>
	 * sections, which are copied verbatim.
	 * <p>
	 * NOTE(review): only "\r\n" line breaks are handled and only the exact
	 * tag <code>&lt;pre&gt;</code> (without attributes) is recognised;
	 * <code>indentStr</code> is used inside a regular expression, so it is
	 * expected to contain no regex metacharacters (callers in this class
	 * pass spaces only).
	 * 
	 * @param html
	 *            the indented HTML text.
	 * @param indentStr
	 *            the indentation to remove.
	 * @return the text with the indentation removed.
	 */
	protected static String fixIndentation(String html, String indentStr) {
		if (html.startsWith(indentStr)) {
			html = html.substring(indentStr.length());
		}

		String indentedLineBreak = "\r\n" + indentStr; //$NON-NLS-1$
		String lineBreak = "\r\n"; //$NON-NLS-1$

		StringBuffer strBuf = new StringBuffer();
		// Index of the first character that has not been copied yet.
		int copyFrom = 0;
		int preStart;
		while ((preStart = html.indexOf(PRE_TAG_START, copyFrom)) != -1) {
			// Text in front of the <pre> section: remove the indent.
			strBuf.append(html.substring(copyFrom, preStart).replaceAll(
					indentedLineBreak, lineBreak));

			int preEnd = html.indexOf(PRE_TAG_END, preStart);
			if (preEnd == -1) {
				// found <pre>, but no ending </pre> - shouldn't ever get here
				// append rest of string and return it
				strBuf.append(html.substring(preStart));
				return strBuf.toString();
			}

			// The <pre> section itself, including both tags: copy unchanged.
			copyFrom = preEnd + PRE_TAG_END_LENGTH;
			strBuf.append(html.substring(preStart, copyFrom));
		}

		// Remaining text after the last <pre> section (or the whole text).
		strBuf.append(html.substring(copyFrom).replaceAll(indentedLineBreak,
				lineBreak));
		return strBuf.toString();
	}

	/**
	 * Returns the error text recorded by the most recent formatHTML call.
	 * 
	 * @return the error text, or <code>null</code> if no error was recorded.
	 */
	public String getLastErrorStr() {
		return lastErrorStr;
	}

	/**
	 * Removes every match of the <code>p_whitespace</code> pattern from the
	 * given text.
	 * <p>
	 * NOTE(review): <code>p_whitespace</code> is not declared in this class;
	 * presumably it is a constant inherited from {@link IHTMLFormatter} that
	 * matches leading whitespace - confirm.
	 * 
	 * @param input
	 *            the text to clean; must not be <code>null</code>.
	 * @return the text with all matches removed.
	 */
	public String removeLeadingWhitespace(String input) {
		return p_whitespace.matcher(input).replaceAll(""); //$NON-NLS-1$
	}
}

Back to the top