david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 1 | /******************************************************************************* |
| 2 | * Copyright (c) 2001, 2004 IBM Corporation and others. |
| 3 | * All rights reserved. This program and the accompanying materials |
| 4 | * are made available under the terms of the Eclipse Public License v1.0 |
| 5 | * which accompanies this distribution, and is available at |
| 6 | * http://www.eclipse.org/legal/epl-v10.html |
| 7 | * |
| 8 | * Contributors: |
| 9 | * IBM Corporation - initial API and implementation |
| 10 | * Jens Lukowski/Innoopract - initial renaming/restructuring |
| 11 | * |
| 12 | *******************************************************************************/ |
| 13 | package org.eclipse.wst.sse.core.internal.encoding; |
| 14 | |
| 15 | import org.eclipse.core.runtime.content.IContentDescription; |
| 16 | |
| 17 | |
| 18 | /** |
| 19 | * This class is to simply hold information and data about the type of |
| 20 | * encoding found for a resource. It not only includes names, etc., but also |
| 21 | * gives hints about the algorithm, or rule, that the encodng was determined. |
| 22 | * Having all this info in a central object, associated with the Document |
| 23 | * (technically, IStructuredDocument), allows for better user error messages, |
| 24 | * and better handling of knowing how to dump a file, given we know how it was |
| 25 | * loaded. |
| 26 | * |
| 27 | * Note: the data in this class is only valid if its has actually gone through |
| 28 | * the loading or dumping sequence. It is not accurate, for example, if a |
| 29 | * structuredDocument is simply created and then setText called. In this type |
| 30 | * of case, accuracy for loading and dumping is not required, since its all |
| 31 | * re-discovered. One limitation is that structuredDocument's created "from |
| 32 | * scratch" this way, don't have any encoding information to count on, and |
| 33 | * would have to arrange the processing to be done. (And it is done, |
| 34 | * automatically if going through loader or dumper, but perhaps not in future |
| 35 | * new uses. TODO: this can be inproved in future versions.) |
| 36 | * |
| 37 | * isInitialized is set when the loader or dumper processes have been used, |
| 38 | * but even this can't be counted on 100% if the document has been modified |
| 39 | * since. |
david_williams | 7a65dc2 | 2005-04-09 02:19:50 +0000 | [diff] [blame] | 40 | * |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 41 | */ |
| 42 | public class EncodingMemento implements Cloneable { |
| 43 | |
| 44 | public final static String CLONED = "cloned"; //$NON-NLS-1$ |
| 45 | public final static String DEFAULTS_ASSUMED_FOR_EMPTY_INPUT = "DefaultsAssumedForEmptyInput"; //$NON-NLS-1$ |
| 46 | public final static String DEFAULTS_USED_DUE_TO_SMALL_STREAM = "defaultsUsedDueToSmallStream"; //$NON-NLS-1$ |
| 47 | |
| 48 | |
| 49 | /* |
| 50 | * Strings to be used for tracing. TODO: need to clean this up, we no |
| 51 | * longer use all of them |
| 52 | */ |
| 53 | public final static String DETECTED_STANDARD_UNICODE_BYTES = "detectedStandardUnicodeBytes"; //$NON-NLS-1$ |
| 54 | public final static String FOUND_ENCODING_IN_CONTENT = "foundEncodingInContent"; //$NON-NLS-1$ |
| 55 | public final static String FOUND_ENCODING_IN_STREAM = "foundEncodingInStream"; //$NON-NLS-1$ |
| 56 | public final static String FOUND_ENCODING_IN_STRUCTURED_DOCUMENT = "foundEncodingInStructuredDocument"; //$NON-NLS-1$ |
| 57 | public final static String GUESSED_ENCODING_FROM_STREAM = "GuessEncodingFromStream"; //$NON-NLS-1$ |
| 58 | public final static String JAVA_NAME_FOUND_AS_IANA_NAME = "noMappingFoundButJavaNameFoundToBeIANAName"; //$NON-NLS-1$ |
| 59 | public final static String JAVA_NAME_FOUND_IN_ALIAS_NAME = "noMappingFoundButJavaNameFoundInAliasTable"; //$NON-NLS-1$ |
| 60 | public final static String NO_IANA_NAME_FOUND = "noMappingFoundFromJavaNameToIANAName"; //$NON-NLS-1$ |
| 61 | public final static String USED_CONTENT_TYPE_DEFAULT = "UsedContentTypeDefault"; //$NON-NLS-1$ |
| 62 | public final static String USED_JAVA_DEFAULT = "UsedJavaDefault"; //$NON-NLS-1$ |
| 63 | public final static String USED_MEMENTO_FROM_LOAD = "usedMementoFromLoad"; //$NON-NLS-1$ |
| 64 | public final static String USED_PROPERTY_SETTINGS = "USED_PROPERTY_SETTINGS"; //$NON-NLS-1$ |
| 65 | public final static String USED_USER_SPECIFIED_PREFERENCE = "UsedUserSpecifiedPreference"; //$NON-NLS-1$ |
| 66 | public final static String USED_WORKSPACE_DEFAULT = "UsedWorkspaceDefault"; //$NON-NLS-1$ |
| 67 | public final static String USER_IS_USING_JAVA_ENCODING = "UserIsUsingJavaEncoding"; //$NON-NLS-1$ |
| 68 | private String fAppropriateDefault; |
| 69 | private String fDetectedCharsetName; |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 70 | private String fInvalidEncoding; |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 71 | |
| 72 | |
| 73 | private String fJavaCharsetName; |
| 74 | private boolean fUnicodeStream; |
| 75 | private boolean fUTF83ByteBOMUsed; |
| 76 | |
david_williams | 126339f | 2005-07-05 05:54:08 +0000 | [diff] [blame] | 77 | public EncodingMemento() { |
| 78 | super(); |
| 79 | } |
| 80 | |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 81 | /** |
david_williams | 7a65dc2 | 2005-04-09 02:19:50 +0000 | [diff] [blame] | 82 | * Returns a clone of this object. |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 83 | */ |
| 84 | public Object clone() { |
| 85 | EncodingMemento object = null; |
| 86 | try { |
| 87 | object = (EncodingMemento) super.clone(); |
david_williams | 7a65dc2 | 2005-04-09 02:19:50 +0000 | [diff] [blame] | 88 | } |
| 89 | catch (CloneNotSupportedException e) { |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 90 | // impossible, since we're implementing here |
| 91 | } |
| 92 | |
| 93 | return object; |
| 94 | |
| 95 | } |
| 96 | |
| 97 | /** |
| 98 | * Returns the appropriateDefault. This is only set if an invalid encoding |
| 99 | * was found, and contains an charset appropriate to use as a default |
| 100 | * value, if, for example, the user decides to load the document anyway, |
| 101 | * even though the charset was found to be invalid. |
| 102 | * |
| 103 | * @return String |
| 104 | */ |
| 105 | public String getAppropriateDefault() { |
| 106 | if (fAppropriateDefault == null) { |
| 107 | fAppropriateDefault = NonContentBasedEncodingRules.useDefaultNameRules(null); |
| 108 | } |
| 109 | return fAppropriateDefault; |
| 110 | } |
| 111 | |
| 112 | /** |
| 113 | * Returns the charset name, if it is different from the charset name |
| 114 | * found in getJavaCharsetName. This can happen, for example, if there are |
| 115 | * differences in case. This method might return SHIFT_JIS, and the the |
| 116 | * getJavaCharsetName might return Shift_JIS -- if SHIFT_JIS was detected |
| 117 | * in file/document. If the original file contained the correct case, then |
| 118 | * this method would return null. The getJavaCharsetName is typically the |
| 119 | * one that should always be used, and this one only used for certain |
| 120 | * error conditions, or or if when creating a "duplicate" resource, it was |
| 121 | * desired to use exactly the charset name as in the original document. As |
| 122 | * an example of this later case, the original document might contain |
| 123 | * ISO-8859-9, but the detected charset name might contain ISO-8859-9-I. |
| 124 | * |
| 125 | * @return String |
| 126 | */ |
| 127 | public String getDetectedCharsetName() { |
| 128 | return fDetectedCharsetName; |
| 129 | } |
| 130 | |
| 131 | /** |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 132 | * Returns a charset name that was detected, but not found to be a charset |
| 133 | * suppoorted by the VM. |
| 134 | * |
| 135 | * @return String |
| 136 | */ |
| 137 | public String getInvalidEncoding() { |
| 138 | return fInvalidEncoding; |
| 139 | } |
| 140 | |
| 141 | /** |
| 142 | * Returns the java cononical charset name. |
| 143 | * |
| 144 | * @return String |
| 145 | */ |
| 146 | public String getJavaCharsetName() { |
| 147 | return fJavaCharsetName; |
| 148 | } |
| 149 | |
| 150 | /** |
| 151 | * Note: we may be able to remove this method, if it turns out this work |
| 152 | * is done by "text" type. |
| 153 | * |
| 154 | * @deprecated - |
| 155 | */ |
| 156 | public byte[] getUnicodeBOM() { |
| 157 | byte[] bom = null; |
| 158 | if (isUTF83ByteBOMUsed()) |
| 159 | bom = IContentDescription.BOM_UTF_8; |
| 160 | else if (isUnicodeStream()) { |
| 161 | if (getJavaCharsetName().equals("UTF-16") || getJavaCharsetName().equals("UTF-16LE")) { //$NON-NLS-1$ //$NON-NLS-2$ |
| 162 | bom = IContentDescription.BOM_UTF_16LE; |
david_williams | 7a65dc2 | 2005-04-09 02:19:50 +0000 | [diff] [blame] | 163 | } |
| 164 | else if (getJavaCharsetName().equals("UTF-16BE")) { //$NON-NLS-1$ |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 165 | bom = IContentDescription.BOM_UTF_16BE; |
| 166 | } |
| 167 | |
| 168 | } |
| 169 | return bom; |
| 170 | } |
| 171 | |
| 172 | /** |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 173 | * Note: in our implementation, the stream is a unicode stream if the |
| 174 | * charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is |
| 175 | * not considered unicode stream here. |
| 176 | * |
| 177 | * @return returns true if is a unicode (UTF-16) stream |
| 178 | */ |
| 179 | public boolean isUnicodeStream() { |
| 180 | return fUnicodeStream; |
| 181 | } |
| 182 | |
| 183 | /** |
| 184 | * Note: in our implementation, the stream is a unicode stream if the |
| 185 | * charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is |
| 186 | * not considered unicode stream here. |
| 187 | * |
| 188 | * Set during load, can be used by dumper to write 3 byte BOM, which Java |
| 189 | * does not normally do. This helps maintain compatibility with other |
| 190 | * programs (those that wrote the 3 byte BOM there to begin with. |
| 191 | * |
| 192 | * @return boolean |
| 193 | */ |
| 194 | public boolean isUTF83ByteBOMUsed() { |
| 195 | return fUTF83ByteBOMUsed; |
| 196 | } |
| 197 | |
| 198 | public boolean isValid() { |
| 199 | return getInvalidEncoding() == null; |
| 200 | } |
| 201 | |
| 202 | /** |
| 203 | * Sets the appropriateDefault. |
| 204 | * |
| 205 | * @param appropriateDefault |
| 206 | * The appropriateDefault to set |
| 207 | */ |
| 208 | public void setAppropriateDefault(String appropriateDefault) { |
| 209 | fAppropriateDefault = appropriateDefault; |
| 210 | } |
| 211 | |
| 212 | |
| 213 | public void setDetectedCharsetName(String detectedCharsetName) { |
| 214 | fDetectedCharsetName = detectedCharsetName; |
| 215 | } |
| 216 | |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 217 | public void setInvalidEncoding(String invalidEncoding) { |
| 218 | fInvalidEncoding = invalidEncoding; |
| 219 | } |
| 220 | |
| 221 | /** |
david_williams | dce4ddd | 2005-03-18 05:35:37 +0000 | [diff] [blame] | 222 | * Sets the javaEncodingName. |
| 223 | * |
| 224 | * @param javaEncodingName |
| 225 | * The javaEncodingName to set |
| 226 | */ |
| 227 | public void setJavaCharsetName(String javaCharsetName) { |
| 228 | fJavaCharsetName = javaCharsetName; |
| 229 | } |
| 230 | |
| 231 | /** |
| 232 | * @param b |
| 233 | */ |
| 234 | public void setUnicodeStream(boolean unicodeStream) { |
| 235 | fUnicodeStream = unicodeStream; |
| 236 | |
| 237 | } |
| 238 | |
| 239 | /** |
| 240 | * Sets the uTF83ByteBOMfound. |
| 241 | * |
| 242 | * @param uTF83ByteBOMfound |
| 243 | * The uTF83ByteBOMfound to set |
| 244 | */ |
| 245 | public void setUTF83ByteBOMUsed(boolean uTF83ByteBOMUsed) { |
| 246 | fUTF83ByteBOMUsed = uTF83ByteBOMUsed; |
| 247 | } |
| 248 | |
| 249 | } |