Home

Contribute

Source code


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249

/*******************************************************************************
 * Copyright (c) 2001, 2004 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *     
 *******************************************************************************/
package org.eclipse.wst.sse.core.internal.encoding;

import org.eclipse.core.runtime.content.IContentDescription;


/**
 * This class simply holds information and data about the type of encoding
 * found for a resource. It not only includes names, etc., but also gives
 * hints about the algorithm, or rule, by which the encoding was determined.
 * Having all this info in a central object, associated with the Document
 * (technically, IStructuredDocument), allows for better user error messages,
 * and better handling of knowing how to dump a file, given we know how it was
 * loaded.
 * 
 * Note: the data in this class is only valid if it has actually gone through
 * the loading or dumping sequence. It is not accurate, for example, if a
 * structuredDocument is simply created and then setText called. In this type
 * of case, accuracy for loading and dumping is not required, since it is all
 * re-discovered. One limitation is that structuredDocuments created "from
 * scratch" this way don't have any encoding information to count on, and
 * would have to arrange for the processing to be done. (And it is done
 * automatically if going through loader or dumper, but perhaps not in future
 * new uses. TODO: this can be improved in future versions.)
 * 
 * isInitialized is set when the loader or dumper processes have been used,
 * but even this can't be counted on 100% if the document has been modified
 * since. NOTE(review): no isInitialized member is declared in this class;
 * this paragraph may be historical -- confirm.
 * 
 */
public class EncodingMemento implements Cloneable {

	public final static String CLONED = "cloned"; //$NON-NLS-1$
	public final static String DEFAULTS_ASSUMED_FOR_EMPTY_INPUT = "DefaultsAssumedForEmptyInput"; //$NON-NLS-1$
	public final static String DEFAULTS_USED_DUE_TO_SMALL_STREAM = "defaultsUsedDueToSmallStream"; //$NON-NLS-1$


	/*
	 * Strings to be used for tracing. TODO: need to clean this up, we no
	 * longer use all of them
	 */
	public final static String DETECTED_STANDARD_UNICODE_BYTES = "detectedStandardUnicodeBytes"; //$NON-NLS-1$
	public final static String FOUND_ENCODING_IN_CONTENT = "foundEncodingInContent"; //$NON-NLS-1$
	public final static String FOUND_ENCODING_IN_STREAM = "foundEncodingInStream"; //$NON-NLS-1$
	public final static String FOUND_ENCODING_IN_STRUCTURED_DOCUMENT = "foundEncodingInStructuredDocument"; //$NON-NLS-1$
	public final static String GUESSED_ENCODING_FROM_STREAM = "GuessEncodingFromStream"; //$NON-NLS-1$
	public final static String JAVA_NAME_FOUND_AS_IANA_NAME = "noMappingFoundButJavaNameFoundToBeIANAName"; //$NON-NLS-1$
	public final static String JAVA_NAME_FOUND_IN_ALIAS_NAME = "noMappingFoundButJavaNameFoundInAliasTable"; //$NON-NLS-1$
	public final static String NO_IANA_NAME_FOUND = "noMappingFoundFromJavaNameToIANAName"; //$NON-NLS-1$
	public final static String USED_CONTENT_TYPE_DEFAULT = "UsedContentTypeDefault"; //$NON-NLS-1$
	public final static String USED_JAVA_DEFAULT = "UsedJavaDefault"; //$NON-NLS-1$
	public final static String USED_MEMENTO_FROM_LOAD = "usedMementoFromLoad"; //$NON-NLS-1$
	public final static String USED_PROPERTY_SETTINGS = "USED_PROPERTY_SETTINGS"; //$NON-NLS-1$
	public final static String USED_USER_SPECIFIED_PREFERENCE = "UsedUserSpecifiedPreference"; //$NON-NLS-1$
	public final static String USED_WORKSPACE_DEFAULT = "UsedWorkspaceDefault"; //$NON-NLS-1$
	public final static String USER_IS_USING_JAVA_ENCODING = "UserIsUsingJavaEncoding"; //$NON-NLS-1$
	private String fAppropriateDefault;
	private String fDetectedCharsetName;
	private String fInvalidEncoding;


	private String fJavaCharsetName;
	private boolean fUnicodeStream;
	private boolean fUTF83ByteBOMUsed;

	/**
	 * Creates an empty memento: all names are null and both boolean flags
	 * are false until the corresponding setters are called.
	 */
	public EncodingMemento() {
		super();
	}

	/**
	 * Returns a field-by-field copy of this object, as produced by
	 * Object.clone(). All fields are Strings or booleans, so the copy
	 * shares no mutable state with the original.
	 * 
	 * @return a new EncodingMemento with the same field values; never null
	 */
	public Object clone() {
		try {
			return super.clone();
		}
		catch (CloneNotSupportedException e) {
			// Cannot happen, since this class implements Cloneable. Fail
			// loudly rather than hand a null "clone" back to the caller.
			InternalError error = new InternalError("EncodingMemento could not be cloned"); //$NON-NLS-1$
			error.initCause(e);
			throw error;
		}
	}

	/**
	 * Returns the appropriateDefault. This is intended to be set when an
	 * invalid encoding was found, and contains a charset appropriate to use
	 * as a default value, if, for example, the user decides to load the
	 * document anyway, even though the charset was found to be invalid.
	 * 
	 * If no value has been set, one is obtained from
	 * NonContentBasedEncodingRules.useDefaultNameRules(null) on first access
	 * and remembered for subsequent calls.
	 * 
	 * @return String
	 */
	public String getAppropriateDefault() {
		if (fAppropriateDefault == null) {
			fAppropriateDefault = NonContentBasedEncodingRules.useDefaultNameRules(null);
		}
		return fAppropriateDefault;
	}

	/**
	 * Returns the charset name, if it is different from the charset name
	 * found in getJavaCharsetName. This can happen, for example, if there are
	 * differences in case. This method might return SHIFT_JIS, and the
	 * getJavaCharsetName might return Shift_JIS -- if SHIFT_JIS was detected
	 * in file/document. If the original file contained the correct case, then
	 * this method would return null. The getJavaCharsetName is typically the
	 * one that should always be used, and this one only used for certain
	 * error conditions, or if, when creating a "duplicate" resource, it was
	 * desired to use exactly the charset name as in the original document. As
	 * an example of this latter case, the original document might contain
	 * ISO-8859-9, but the detected charset name might contain ISO-8859-9-I.
	 * 
	 * @return String
	 */
	public String getDetectedCharsetName() {
		return fDetectedCharsetName;
	}

	/**
	 * Returns a charset name that was detected, but not found to be a charset
	 * supported by the VM.
	 * 
	 * @return String, or null if no invalid encoding has been recorded
	 */
	public String getInvalidEncoding() {
		return fInvalidEncoding;
	}

	/**
	 * Returns the java canonical charset name.
	 * 
	 * @return String, or null if it has not been set
	 */
	public String getJavaCharsetName() {
		return fJavaCharsetName;
	}

	/**
	 * Returns the byte order mark that corresponds to the recorded state:
	 * the UTF-8 BOM if the 3 byte BOM flag is set; otherwise, for a unicode
	 * stream, the UTF-16LE BOM when the java charset name is UTF-16 or
	 * UTF-16LE, and the UTF-16BE BOM when it is UTF-16BE.
	 * 
	 * Note: we may be able to remove this method, if it turns out this work
	 * is done by "text" type.
	 * 
	 * @return the BOM bytes, or null if none applies (including a unicode
	 *         stream whose java charset name is unset or unrecognized)
	 * @deprecated -
	 */
	public byte[] getUnicodeBOM() {
		byte[] bom = null;
		if (isUTF83ByteBOMUsed()) {
			bom = IContentDescription.BOM_UTF_8;
		}
		else if (isUnicodeStream()) {
			// The charset name may legitimately be unset; comparing with
			// the literal on the left avoids a NullPointerException.
			String charsetName = getJavaCharsetName();
			if ("UTF-16".equals(charsetName) || "UTF-16LE".equals(charsetName)) { //$NON-NLS-1$ //$NON-NLS-2$
				bom = IContentDescription.BOM_UTF_16LE;
			}
			else if ("UTF-16BE".equals(charsetName)) { //$NON-NLS-1$
				bom = IContentDescription.BOM_UTF_16BE;
			}
		}
		return bom;
	}

	/**
	 * Note: in our implementation, the stream is a unicode stream if the
	 * charset is UTF-16, UTF-16LE, or UTF-16BE. A stream with 3 byte BOM is
	 * not considered unicode stream here.
	 * 
	 * @return returns true if is a unicode (UTF-16) stream
	 */
	public boolean isUnicodeStream() {
		return fUnicodeStream;
	}

	/**
	 * Tells whether the UTF-8 3 byte BOM was used. This flag is separate
	 * from isUnicodeStream: a stream with the 3 byte BOM is not considered
	 * a unicode stream here.
	 * 
	 * Set during load, can be used by dumper to write 3 byte BOM, which Java
	 * does not normally do. This helps maintain compatibility with other
	 * programs (those that wrote the 3 byte BOM there to begin with).
	 * 
	 * @return boolean
	 */
	public boolean isUTF83ByteBOMUsed() {
		return fUTF83ByteBOMUsed;
	}

	/**
	 * Tells whether the encoding is considered valid, which is the case
	 * exactly when no invalid encoding name has been recorded.
	 * 
	 * @return true if getInvalidEncoding() returns null
	 */
	public boolean isValid() {
		return getInvalidEncoding() == null;
	}

	/**
	 * Sets the appropriateDefault.
	 * 
	 * @param appropriateDefault
	 *            The appropriateDefault to set; if null, a default is
	 *            looked up again on the next call to
	 *            getAppropriateDefault()
	 */
	public void setAppropriateDefault(String appropriateDefault) {
		fAppropriateDefault = appropriateDefault;
	}


	/**
	 * Sets the detectedCharsetName.
	 * 
	 * @param detectedCharsetName
	 *            The detectedCharsetName to set
	 */
	public void setDetectedCharsetName(String detectedCharsetName) {
		fDetectedCharsetName = detectedCharsetName;
	}

	/**
	 * Sets the invalidEncoding. A non-null value makes isValid() return
	 * false.
	 * 
	 * @param invalidEncoding
	 *            The invalidEncoding to set
	 */
	public void setInvalidEncoding(String invalidEncoding) {
		fInvalidEncoding = invalidEncoding;
	}

	/**
	 * Sets the javaCharsetName.
	 * 
	 * @param javaCharsetName
	 *            The javaCharsetName to set
	 */
	public void setJavaCharsetName(String javaCharsetName) {
		fJavaCharsetName = javaCharsetName;
	}

	/**
	 * Sets whether the stream is a unicode (UTF-16) stream.
	 * 
	 * @param unicodeStream
	 *            true if the stream is a unicode stream
	 */
	public void setUnicodeStream(boolean unicodeStream) {
		fUnicodeStream = unicodeStream;

	}

	/**
	 * Sets the uTF83ByteBOMUsed.
	 * 
	 * @param uTF83ByteBOMUsed
	 *            The uTF83ByteBOMUsed to set
	 */
	public void setUTF83ByteBOMUsed(boolean uTF83ByteBOMUsed) {
		fUTF83ByteBOMUsed = uTF83ByteBOMUsed;
	}

}