blob: 853dfde27d2221b80c4fb9dbc65c65c7821060cb [file] [log] [blame]
/*******************************************************************************
 * Copyright (c) 2001, 2004 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 *
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *
 *******************************************************************************/
13package org.eclipse.wst.sse.core.internal.encoding;
14
15import org.eclipse.core.runtime.content.IContentDescription;
16
17
/**
 * This class simply holds information and data about the type of encoding
 * found for a resource. It not only includes names, etc., but also gives
 * hints about the algorithm, or rule, by which the encoding was determined.
 * Having all this info in a central object, associated with the Document
 * (technically, IStructuredDocument), allows for better user error messages,
 * and better handling of knowing how to dump a file, given we know how it was
 * loaded.
 *
 * Note: the data in this class is only valid if it has actually gone through
 * the loading or dumping sequence. It is not accurate, for example, if a
 * structuredDocument is simply created and then setText called. In this type
 * of case, accuracy for loading and dumping is not required, since it's all
 * re-discovered. One limitation is that structuredDocuments created "from
 * scratch" this way don't have any encoding information to count on, and
 * would have to arrange for the processing to be done. (And it is done
 * automatically if going through loader or dumper, but perhaps not in future
 * new uses. TODO: this can be improved in future versions.)
 *
 * isInitialized is set when the loader or dumper processes have been used,
 * but even this can't be counted on 100% if the document has been modified
 * since.
 *
 */
42public class EncodingMemento implements Cloneable {
43
44 public final static String CLONED = "cloned"; //$NON-NLS-1$
45 public final static String DEFAULTS_ASSUMED_FOR_EMPTY_INPUT = "DefaultsAssumedForEmptyInput"; //$NON-NLS-1$
46 public final static String DEFAULTS_USED_DUE_TO_SMALL_STREAM = "defaultsUsedDueToSmallStream"; //$NON-NLS-1$
47
48
49 /*
50 * Strings to be used for tracing. TODO: need to clean this up, we no
51 * longer use all of them
52 */
53 public final static String DETECTED_STANDARD_UNICODE_BYTES = "detectedStandardUnicodeBytes"; //$NON-NLS-1$
54 public final static String FOUND_ENCODING_IN_CONTENT = "foundEncodingInContent"; //$NON-NLS-1$
55 public final static String FOUND_ENCODING_IN_STREAM = "foundEncodingInStream"; //$NON-NLS-1$
56 public final static String FOUND_ENCODING_IN_STRUCTURED_DOCUMENT = "foundEncodingInStructuredDocument"; //$NON-NLS-1$
57 public final static String GUESSED_ENCODING_FROM_STREAM = "GuessEncodingFromStream"; //$NON-NLS-1$
58 public final static String JAVA_NAME_FOUND_AS_IANA_NAME = "noMappingFoundButJavaNameFoundToBeIANAName"; //$NON-NLS-1$
59 public final static String JAVA_NAME_FOUND_IN_ALIAS_NAME = "noMappingFoundButJavaNameFoundInAliasTable"; //$NON-NLS-1$
60 public final static String NO_IANA_NAME_FOUND = "noMappingFoundFromJavaNameToIANAName"; //$NON-NLS-1$
61 public final static String USED_CONTENT_TYPE_DEFAULT = "UsedContentTypeDefault"; //$NON-NLS-1$
62 public final static String USED_JAVA_DEFAULT = "UsedJavaDefault"; //$NON-NLS-1$
63 public final static String USED_MEMENTO_FROM_LOAD = "usedMementoFromLoad"; //$NON-NLS-1$
64 public final static String USED_PROPERTY_SETTINGS = "USED_PROPERTY_SETTINGS"; //$NON-NLS-1$
65 public final static String USED_USER_SPECIFIED_PREFERENCE = "UsedUserSpecifiedPreference"; //$NON-NLS-1$
66 public final static String USED_WORKSPACE_DEFAULT = "UsedWorkspaceDefault"; //$NON-NLS-1$
67 public final static String USER_IS_USING_JAVA_ENCODING = "UserIsUsingJavaEncoding"; //$NON-NLS-1$
68 private String fAppropriateDefault;
69 private String fDetectedCharsetName;
david_williamsdce4ddd2005-03-18 05:35:37 +000070 private String fInvalidEncoding;
david_williamsdce4ddd2005-03-18 05:35:37 +000071
72
73 private String fJavaCharsetName;
74 private boolean fUnicodeStream;
75 private boolean fUTF83ByteBOMUsed;
76
david_williams126339f2005-07-05 05:54:08 +000077 public EncodingMemento() {
78 super();
79 }
80
david_williamsdce4ddd2005-03-18 05:35:37 +000081 /**
david_williams7a65dc22005-04-09 02:19:50 +000082 * Returns a clone of this object.
david_williamsdce4ddd2005-03-18 05:35:37 +000083 */
84 public Object clone() {
85 EncodingMemento object = null;
86 try {
87 object = (EncodingMemento) super.clone();
david_williams7a65dc22005-04-09 02:19:50 +000088 }
89 catch (CloneNotSupportedException e) {
david_williamsdce4ddd2005-03-18 05:35:37 +000090 // impossible, since we're implementing here
91 }
92
93 return object;
94
95 }
96
97 /**
98 * Returns the appropriateDefault. This is only set if an invalid encoding
99 * was found, and contains an charset appropriate to use as a default
100 * value, if, for example, the user decides to load the document anyway,
101 * even though the charset was found to be invalid.
102 *
103 * @return String
104 */
105 public String getAppropriateDefault() {
106 if (fAppropriateDefault == null) {
107 fAppropriateDefault = NonContentBasedEncodingRules.useDefaultNameRules(null);
108 }
109 return fAppropriateDefault;
110 }
111
112 /**
113 * Returns the charset name, if it is different from the charset name
114 * found in getJavaCharsetName. This can happen, for example, if there are
115 * differences in case. This method might return SHIFT_JIS, and the the
116 * getJavaCharsetName might return Shift_JIS -- if SHIFT_JIS was detected
117 * in file/document. If the original file contained the correct case, then
118 * this method would return null. The getJavaCharsetName is typically the
119 * one that should always be used, and this one only used for certain
120 * error conditions, or or if when creating a "duplicate" resource, it was
121 * desired to use exactly the charset name as in the original document. As
122 * an example of this later case, the original document might contain
123 * ISO-8859-9, but the detected charset name might contain ISO-8859-9-I.
124 *
125 * @return String
126 */
127 public String getDetectedCharsetName() {
128 return fDetectedCharsetName;
129 }
130
131 /**
david_williamsdce4ddd2005-03-18 05:35:37 +0000132 * Returns a charset name that was detected, but not found to be a charset
133 * suppoorted by the VM.
134 *
135 * @return String
136 */
137 public String getInvalidEncoding() {
138 return fInvalidEncoding;
139 }
140
141 /**
142 * Returns the java cononical charset name.
143 *
144 * @return String
145 */
146 public String getJavaCharsetName() {
147 return fJavaCharsetName;
148 }
149
150 /**
151 * Note: we may be able to remove this method, if it turns out this work
152 * is done by "text" type.
153 *
154 * @deprecated -
155 */
156 public byte[] getUnicodeBOM() {
157 byte[] bom = null;
158 if (isUTF83ByteBOMUsed())
159 bom = IContentDescription.BOM_UTF_8;
160 else if (isUnicodeStream()) {
161 if (getJavaCharsetName().equals("UTF-16") || getJavaCharsetName().equals("UTF-16LE")) { //$NON-NLS-1$ //$NON-NLS-2$
162 bom = IContentDescription.BOM_UTF_16LE;
david_williams7a65dc22005-04-09 02:19:50 +0000163 }
164 else if (getJavaCharsetName().equals("UTF-16BE")) { //$NON-NLS-1$
david_williamsdce4ddd2005-03-18 05:35:37 +0000165 bom = IContentDescription.BOM_UTF_16BE;
166 }
167
168 }
169 return bom;
170 }
171
172 /**
david_williamsdce4ddd2005-03-18 05:35:37 +0000173 * Note: in our implementation, the stream is a unicode stream if the
174 * charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is
175 * not considered unicode stream here.
176 *
177 * @return returns true if is a unicode (UTF-16) stream
178 */
179 public boolean isUnicodeStream() {
180 return fUnicodeStream;
181 }
182
183 /**
184 * Note: in our implementation, the stream is a unicode stream if the
185 * charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is
186 * not considered unicode stream here.
187 *
188 * Set during load, can be used by dumper to write 3 byte BOM, which Java
189 * does not normally do. This helps maintain compatibility with other
190 * programs (those that wrote the 3 byte BOM there to begin with.
191 *
192 * @return boolean
193 */
194 public boolean isUTF83ByteBOMUsed() {
195 return fUTF83ByteBOMUsed;
196 }
197
198 public boolean isValid() {
199 return getInvalidEncoding() == null;
200 }
201
202 /**
203 * Sets the appropriateDefault.
204 *
205 * @param appropriateDefault
206 * The appropriateDefault to set
207 */
208 public void setAppropriateDefault(String appropriateDefault) {
209 fAppropriateDefault = appropriateDefault;
210 }
211
212
213 public void setDetectedCharsetName(String detectedCharsetName) {
214 fDetectedCharsetName = detectedCharsetName;
215 }
216
david_williamsdce4ddd2005-03-18 05:35:37 +0000217 public void setInvalidEncoding(String invalidEncoding) {
218 fInvalidEncoding = invalidEncoding;
219 }
220
221 /**
david_williamsdce4ddd2005-03-18 05:35:37 +0000222 * Sets the javaEncodingName.
223 *
224 * @param javaEncodingName
225 * The javaEncodingName to set
226 */
227 public void setJavaCharsetName(String javaCharsetName) {
228 fJavaCharsetName = javaCharsetName;
229 }
230
231 /**
232 * @param b
233 */
234 public void setUnicodeStream(boolean unicodeStream) {
235 fUnicodeStream = unicodeStream;
236
237 }
238
239 /**
240 * Sets the uTF83ByteBOMfound.
241 *
242 * @param uTF83ByteBOMfound
243 * The uTF83ByteBOMfound to set
244 */
245 public void setUTF83ByteBOMUsed(boolean uTF83ByteBOMUsed) {
246 fUTF83ByteBOMUsed = uTF83ByteBOMUsed;
247 }
248
249}