/*******************************************************************************
 * Copyright (c) 2001, 2004 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *
 *******************************************************************************/
david_williams | 282b8f4 | 2005-02-14 07:00:56 +0000 | [diff] [blame] | 13 | package org.eclipse.wst.xml.core.internal.contenttype; |
david_williams | 9621348 | 2004-11-11 09:07:12 +0000 | [diff] [blame] | 14 | |
| 15 | import java.io.IOException; |
| 16 | |
david_williams | 9621348 | 2004-11-11 09:07:12 +0000 | [diff] [blame] | 17 | import org.eclipse.wst.common.encoding.EncodingMemento; |
david_williams | 9621348 | 2004-11-11 09:07:12 +0000 | [diff] [blame] | 18 | import org.eclipse.wst.common.encoding.IResourceCharsetDetector; |
david_williams | 9621348 | 2004-11-11 09:07:12 +0000 | [diff] [blame] | 19 | |
| 20 | |
| 21 | public class XMLResourceEncodingDetector extends AbstractResourceEncodingDetector implements IResourceCharsetDetector { |
| 22 | private XMLHeadTokenizer fTokenizer; |
| 23 | |
| 24 | private boolean canHandleAsUnicodeStream(String tokenType) { |
| 25 | boolean canHandleAsUnicodeStream = false; |
| 26 | if (tokenType == EncodingParserConstants.UTF83ByteBOM) { |
| 27 | canHandleAsUnicodeStream = true; |
| 28 | String enc = "UTF-8"; //$NON-NLS-1$ |
| 29 | createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| 30 | fEncodingMemento.setUTF83ByteBOMUsed(true); |
| 31 | } |
| 32 | |
| 33 | else if (tokenType == EncodingParserConstants.UTF16BE) { |
| 34 | canHandleAsUnicodeStream = true; |
| 35 | String enc = "UTF-16BE"; //$NON-NLS-1$ |
| 36 | createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| 37 | } else if (tokenType == EncodingParserConstants.UTF16LE) { |
| 38 | canHandleAsUnicodeStream = true; |
| 39 | String enc = "UTF-16"; //$NON-NLS-1$ |
| 40 | createEncodingMemento(enc, EncodingMemento.DETECTED_STANDARD_UNICODE_BYTES); |
| 41 | } |
| 42 | return canHandleAsUnicodeStream; |
| 43 | } |
| 44 | |
| 45 | public String getSpecDefaultEncoding() { |
| 46 | // by default, UTF-8 as per XML spec |
| 47 | final String enc = "UTF-8"; //$NON-NLS-1$ |
| 48 | return enc; |
| 49 | } |
| 50 | |
| 51 | /** |
| 52 | * @return Returns the tokenizer. |
| 53 | */ |
| 54 | private XMLHeadTokenizer getTokenizer() { |
| 55 | // TODO: need to work on 'reset' in tokenizer, so new instance isn't |
| 56 | // always needed |
| 57 | //if (fTokenizer == null) { |
| 58 | fTokenizer = new XMLHeadTokenizer(); |
| 59 | //} |
| 60 | return fTokenizer; |
| 61 | } |
| 62 | |
| 63 | private boolean isLegalString(String valueTokenType) { |
| 64 | if (valueTokenType == null) |
| 65 | return false; |
| 66 | else |
| 67 | return valueTokenType.equals(EncodingParserConstants.StringValue) || valueTokenType.equals(EncodingParserConstants.UnDelimitedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTerminatedStringValue) || valueTokenType.equals(EncodingParserConstants.InvalidTermintatedUnDelimitedStringValue); |
| 68 | } |
| 69 | |
| 70 | protected void parseInput() throws IOException { |
| 71 | XMLHeadTokenizer tokenizer = getTokenizer(); |
| 72 | tokenizer.reset(fReader); |
| 73 | HeadParserToken token = null; |
| 74 | String tokenType = null; |
| 75 | do { |
| 76 | token = tokenizer.getNextToken(); |
| 77 | tokenType = token.getType(); |
| 78 | if (canHandleAsUnicodeStream(tokenType)) { |
| 79 | // side effect of canHandle is to create appropriate memento |
| 80 | } else { |
| 81 | if (tokenType == XMLHeadTokenizerConstants.XMLDelEncoding) { |
| 82 | if (tokenizer.hasMoreTokens()) { |
| 83 | token = tokenizer.getNextToken(); |
| 84 | tokenType = token.getType(); |
| 85 | if (isLegalString(tokenType)) { |
| 86 | String enc = token.getText(); |
| 87 | if (enc != null && enc.length() > 0) { |
| 88 | createEncodingMemento(enc, EncodingMemento.FOUND_ENCODING_IN_CONTENT); |
| 89 | } |
| 90 | |
| 91 | } |
| 92 | } |
| 93 | } |
| 94 | } |
| 95 | } while (tokenizer.hasMoreTokens()); |
| 96 | |
| 97 | } |
| 98 | |
| 99 | } |