diff options
author | cbrealey | 2006-06-05 21:06:40 +0000 |
---|---|---|
committer | cbrealey | 2006-06-05 21:06:40 +0000 |
commit | 02249aebb88729977b05e4edeea527c69a5d56d4 (patch) | |
tree | df351f3243a1653e6b783d561d410fe4ee34d81f /bundles/org.eclipse.wst.ws.parser | |
parent | 29b2fa1c33e56caf710e03e20f8e401264aabbad (diff) | |
download | webtools.webservices-02249aebb88729977b05e4edeea527c69a5d56d4.tar.gz webtools.webservices-02249aebb88729977b05e4edeea527c69a5d56d4.tar.xz webtools.webservices-02249aebb88729977b05e4edeea527c69a5d56d4.zip |
[142324] WebServicesParser ignores charset when parsing HTML documents
Diffstat (limited to 'bundles/org.eclipse.wst.ws.parser')
2 files changed, 141 insertions, 11 deletions
diff --git a/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/HTMLHeadHandler.java b/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/HTMLHeadHandler.java index 36994bb05..e63c60df3 100644 --- a/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/HTMLHeadHandler.java +++ b/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/HTMLHeadHandler.java @@ -1,13 +1,16 @@ /******************************************************************************* - * Copyright (c) 2001, 2004 IBM Corporation and others. + * Copyright (c) 2001, 2006 IBM Corporation and others. * All rights reserved. This program and the accompanying materials * are made available under the terms of the Eclipse Public License v1.0 * which accompanies this distribution, and is available at * http://www.eclipse.org/legal/epl-v10.html - * + * * Contributors: * IBM Corporation - initial API and implementation - ******************************************************************************/ + * yyyymmdd bug Email and other contact information + * -------- -------- ----------------------------------------------------------- + * 20060517 142324 rsinha@ca.ibm.com - Rupam Kuehner + *******************************************************************************/ package org.eclipse.wst.ws.internal.parser.wsil; @@ -27,7 +30,13 @@ public class HTMLHeadHandler extends DefaultHandler private final String HEAD_END_TAG = "</head>"; private final String ROOT_START_TAG = "<root>"; private final String ROOT_END_TAG = "</root>"; - private final String UTF8 = "UTF-8"; + private final String UTF8 = "UTF-8"; + + //HTML META tag information used to detect the charset. + private final String HTML_CONTENT = "content"; + private final String HTTP_EQUIV = "http-equiv"; + private final String HTTP_EQUIV_CONTENT_TYPE = "Content-Type"; + private final String CHARSET = "charset"; // WSIL tag information. private final String META = "meta"; @@ -46,6 +55,7 @@ public class HTMLHeadHandler extends DefaultHandler private String baseURI_; private Vector wsils_; private Vector discos_; + private String byteEncoding = UTF8; //Default to UTF-8. public HTMLHeadHandler(String baseURI) { @@ -123,8 +133,20 @@ public class HTMLHeadHandler extends DefaultHandler { } - private void harvestTags(StringBuffer target,String document,String tag) + /** + * Appends the elements of the provided tag in the provided document to the provided StringBuffer. + * @param target + * @param document + * @param tag + * @param encoding + * @return boolean false if the value of the encoding parameter matched the detected charset or if no charset was detected. + * Returns true if a charset was detected and it did not equal the encoding parameter. If true is returned + * the harvesting of the tags would have stopped at the point the charset was detected. The caller + * should call this method again with the correct encoding. + */ + private boolean harvestTags(StringBuffer target,String document,String tag, String encoding) { + boolean changeEncoding = false; int index = document.indexOf(START_TAG); int documentLength = document.length(); int tagLength = tag.length(); @@ -136,19 +158,81 @@ public class HTMLHeadHandler extends DefaultHandler str = document.substring(index,document.indexOf(END_TAG,index+1)+1); target.append(str); index += str.length(); + + //If tag is META and declares the charset, find out what it is + //and if it matches what was passed in. If it matches, continue + //with the parsing and return false when complete. + //If the detected charset is different from what was passed in, + //- change byteEncoding to equal the detected charset. + //- stop parsing. + //- return true. + if (tag.equals(META)) + { + int idxOfContent = str.indexOf(HTML_CONTENT); + int idxOfHTTPEQUIV = str.indexOf(HTTP_EQUIV); + if (idxOfHTTPEQUIV!= -1 && idxOfContent != -1) + { + //Check if the http-equiv attribute is set to Content-Type. + int idxOfHTTPEQUIVOpenQuote = str.indexOf("\"", idxOfHTTPEQUIV+1); + int idxOfHTTPEQUIVClosingQuote = str.indexOf("\"", idxOfHTTPEQUIVOpenQuote+1); + String hTTPEQUIVValueUntrimmed = str.substring(idxOfHTTPEQUIVOpenQuote+1, idxOfHTTPEQUIVClosingQuote); + if (hTTPEQUIVValueUntrimmed.trim().equals(HTTP_EQUIV_CONTENT_TYPE)) + { + //This META tag contains the charset. Get the value of the content attribute + int idxOfOpenQuote = str.indexOf("\"", idxOfContent+1); + int idxOfClosingQuote = str.indexOf("\"", idxOfOpenQuote+1); + String contentValue = str.substring(idxOfOpenQuote+1, idxOfClosingQuote); + + //Get the charset + int idxOfCharSet = contentValue.indexOf(CHARSET); + int idxOfEquals = contentValue.indexOf("=", idxOfCharSet+CHARSET.length()); + String detectedEncodingValueUntrimmed = contentValue.substring(idxOfEquals+1); + String detectedEncodingValue = detectedEncodingValueUntrimmed.trim(); + if (!detectedEncodingValue.equals(encoding)) + { + byteEncoding = detectedEncodingValue; + changeEncoding = true; + break; + } + } + } + } } else index++; index = document.indexOf(START_TAG,index); } + + return changeEncoding; } + + /** + * If the provided byte array reperesents the contents of an HTML + * document, this method will return a byte array in which + * <ul> + * <li>the opening and closing HEAD tags are removed and replaced with + * opening and closing <root> tags</li> + * <li>only the META and LINK elements are in the HTML document + * are included in the contents between the opening and closing + * <root> tags. + * </ul> + * This method will modify the value of the byteEncoding String + * attribute on this class if it is something other than + * UTF-8. Callers of this method should call getByteEncoding() + * after calling this method if they need to know the charset + * value used by this method to decode/endcode the byte array. + * @param b + * @return byte[] + */ public byte[] harvestHeadTags(byte[] b) { String s; + try { - s = new String(b, UTF8); + //Assume the default byte encoding of UTF-8 for now. + s = new String(b, byteEncoding); } catch (UnsupportedEncodingException uee) { @@ -162,10 +246,51 @@ public class HTMLHeadHandler extends DefaultHandler if (headStartIndex != -1 && headEndIndex != -1) { head = s.substring(headStartIndex, headEndIndex+HEAD_END_TAG.length()); - harvestTags(sb,head,META); - harvestTags(sb,head,LINK); + boolean encodingChanged = harvestTags(sb,head,META, byteEncoding); + if (encodingChanged) + { + //The harvestTags method detected a different charset + //than the one that was passed in. Start from the beginning + //with the correct charset. + String s2; + try + { + s2 = new String(b, byteEncoding); + } + catch (UnsupportedEncodingException uee) + { + s2 = new String(b); + } + String head2 = s2.toLowerCase(); + int head2StartIndex = head2.indexOf(HEAD_START_TAG); + int head2EndIndex = head2.indexOf(HEAD_END_TAG); + sb = new StringBuffer(); + sb.append(ROOT_START_TAG); + if (head2StartIndex != -1 && head2EndIndex != -1) + { + head2 = s2.substring(head2StartIndex, head2EndIndex+HEAD_END_TAG.length()); + harvestTags(sb,head2,META, byteEncoding); + harvestTags(sb,head2,LINK,byteEncoding); + } + } + else + { + harvestTags(sb,head,LINK,byteEncoding); + } } sb.append(ROOT_END_TAG); - return sb.toString().getBytes(); + try + { + return sb.toString().getBytes(byteEncoding); + } catch (UnsupportedEncodingException uee) + { + return sb.toString().getBytes(); + } + + } + + public String getByteEncoding() + { + return byteEncoding; } -}
\ No newline at end of file +} diff --git a/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/WebServicesParser.java b/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/WebServicesParser.java index 166e27c8b..1902d6cfd 100644 --- a/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/WebServicesParser.java +++ b/bundles/org.eclipse.wst.ws.parser/src/org/eclipse/wst/ws/internal/parser/wsil/WebServicesParser.java @@ -10,6 +10,7 @@ * yyyymmdd bug Email and other contact information * -------- -------- ----------------------------------------------------------- * 20060504 119296 pmoogk@ca.ibm.com - Peter Moogk + * 20060517 142324 rsinha@ca.ibm.com - Rupam Kuehner *******************************************************************************/ package org.eclipse.wst.ws.internal.parser.wsil; @@ -203,13 +204,17 @@ public class WebServicesParser // parse uri_ as a HTML document HTMLHeadHandler headHandler = new HTMLHeadHandler(theUri); byte[] head = headHandler.harvestHeadTags(b); + String byteEncoding = headHandler.getByteEncoding(); SAXParserFactory factory = SAXParserFactory.newInstance(); factory.setNamespaceAware(false); factory.setValidating(false); SAXParser parser = factory.newSAXParser(); try { - parser.parse(new ByteArrayInputStream(head), headHandler); + ByteArrayInputStream bais = new ByteArrayInputStream(head); + InputStreamReader isr = new InputStreamReader(bais, byteEncoding); + InputSource is = new InputSource(isr); + parser.parse(is, headHandler); } catch (Throwable t) { |