Skip to main content

This CGIT instance is deprecated, and repositories have been moved to Gitlab or Github. See the repository descriptions for specific locations.

summaryrefslogtreecommitdiffstats
blob: 82f6e5c49a7b076583e818bd397be21855d1c7c3 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/*******************************************************************************
 * Copyright (c) 2001, 2004 IBM Corporation and others.
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License v1.0
 * which accompanies this distribution, and is available at
 * http://www.eclipse.org/legal/epl-v10.html
 * 
 * Contributors:
 *     IBM Corporation - initial API and implementation
 *     Jens Lukowski/Innoopract - initial renaming/restructuring
 *     
 *******************************************************************************/
package org.eclipse.wst.css.core.internal.contenttype;

/**
 * Guesses the character encoding of a byte sequence from its leading bytes.
 * Only the Japanese encodings Shift_JIS, EUC-JP and ISO-2022-JP are
 * recognised, and a guess is only attempted when the language of the default
 * locale is Japanese.
 * <p>
 * This is ported from PageDesigner's hpbcom/Kanji.cpp's
 * Kanji::guess_kanji_code().
 */
public class EncodingGuesser {
	// ---- Internal result codes of guessJapaneseKanjiCode() ----
	/** No guess: a NUL byte was seen, or the locale is not supported. */
	private static final int UNKNOWN = -1;
	/** Nothing but ASCII seen so far; scanning continues. */
	private static final int ASCII = 0;
	// 1 was ISO8859_1 (ISO-1) in the original port; it is not used here.
	/** Shift-JIS double-byte character. */
	private static final int SHIFT_JIS = 2;
	/** EUC-JP double-byte character. */
	private static final int EUC_JP = 3;
	/** ISO-2022-JP escape sequence. */
	private static final int ISO2022_JP = 4;
	/** Half-width kana in Shift-JIS. */
	private static final int SJIS_HALFKANA = 5;
	/** Half-width kana in EUC-JP. */
	private static final int EUC_HALFKANA = 6;
	/** Half-width kana in ISO-2022-JP (ESC ( I). */
	private static final int JIS_HALFKANA = 7;
	/** ESC ( followed by a byte flagged KT_JOUT. */
	private static final int ASCII_IN = 8;

	// ---- Bit flags stored in the ktype table ----
	// Bit 0x04 (KT_ESC in the original port) is never set in the table and
	// bit 0x08 (KT_JIS) is set but never consulted by this class.
	/** Byte may follow ESC to form an ISO-2022-JP escape sequence. */
	private static final int KT_JIN = 0x01;
	/** Byte may follow ESC ( to form an escape sequence. */
	private static final int KT_JOUT = 0x02;
	/** Byte may be the first byte of a Shift-JIS double-byte character. */
	private static final int KT_SFT1 = 0x10;
	/** Byte may be the second byte of a Shift-JIS double-byte character. */
	private static final int KT_SFT2 = 0x20;
	/** Byte may be the first byte of an EUC-JP double-byte character. */
	private static final int KT_EUC1 = 0x40;
	/** Byte may be the second byte of an EUC-JP double-byte character. */
	private static final int KT_EUC2 = 0x80;

	/**
	 * Classification flags (KT_* bits) for every unsigned byte value
	 * 0x00..0xff, sixteen entries per row. The entries are held as ints so
	 * that the bit tests never depend on byte sign extension.
	 */
	private static final int[] ktype = {
	/* 00 */0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 10 */0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	/* 20 */0x00, 0x08, 0x08, 0x08, 0x09, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
	/* 30 */0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08,
	/* 40 */0x29, 0x28, 0x2b, 0x28, 0x28, 0x28, 0x28, 0x28, 0x2a, 0x28, 0x2a, 0x28, 0x28, 0x28, 0x28, 0x28,
	/* 50 */0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
	/* 60 */0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28,
	/* 70 */0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x00,
	/* 80 */0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
	/* 90 */0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30,
	/* A0 */0x20, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
	/* B0 */0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
	/* C0 */0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
	/* D0 */0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0, 0xe0,
	/* E0 */0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0,
	/* F0 */0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xf0, 0xc0, 0xc0, 0x00};

	/**
	 * Converts an internal result code to its IANA encoding name.
	 * 
	 * @param code
	 *            one of the internal result codes of this class
	 * @return "Shift_JIS", "EUC-JP" or "ISO-2022-JP", or <code>null</code>
	 *         when the code does not denote one of these encodings
	 */
	private static String convertToIANAEncodingName(int code) {
		String encoding = null;

		switch (code) {
			case SHIFT_JIS :
			case SJIS_HALFKANA :
				encoding = "Shift_JIS";//$NON-NLS-1$
				break;
			case EUC_JP :
			case EUC_HALFKANA :
				encoding = "EUC-JP";//$NON-NLS-1$
				break;
			case ISO2022_JP :
			case JIS_HALFKANA :
				encoding = "ISO-2022-JP";//$NON-NLS-1$
				break;
			default :
				break;
		}

		return encoding;
	}

	/**
	 * Returns the guessed IANA encoding name for the given bytes.
	 * <p>
	 * The bytes are scanned from the start until the first position that is
	 * not classified as plain ASCII; that position alone decides the result.
	 * Input for which no ISO-2022-JP or EUC-JP evidence is found (including
	 * empty or all-ASCII input) is reported as Shift_JIS.
	 * 
	 * @param target
	 *            bytes to be inspected; <code>null</code> is treated as
	 *            empty
	 * @param length
	 *            number of leading bytes of <code>target</code> to inspect;
	 *            values larger than <code>target.length</code> are clamped
	 *            to the array length
	 * @return "Shift_JIS", "EUC-JP" or "ISO-2022-JP" when the default
	 *         locale's language is Japanese, otherwise <code>null</code>
	 */
	public static String guessEncoding(byte[] target, int length) {
		// Currently, only Japanese is supported.
		String systemLanguage = java.util.Locale.getDefault().getLanguage();
		String japaneseLanguage = java.util.Locale.JAPANESE.getLanguage();
		if (!systemLanguage.equals(japaneseLanguage)) {
			return convertToIANAEncodingName(UNKNOWN);
		}

		// Never read past the end of the array, whatever the caller claims.
		int available = (target == null) ? 0 : target.length;
		int remaining = Math.min(length, available);

		int code = ASCII;
		int pos = 0;
		while (code == ASCII && remaining > 0) {
			// Widen to unsigned values; missing look-ahead bytes read as 0.
			int ch1 = target[pos] & 0xFF;
			int ch2 = (remaining >= 2) ? (target[pos + 1] & 0xFF) : 0;
			int ch3 = (remaining >= 3) ? (target[pos + 2] & 0xFF) : 0;
			code = guessJapaneseKanjiCode(ch1, ch2, ch3, 0);
			pos++;
			remaining--;
		}

		// Collapse the detailed result onto the three reported encodings.
		int resolved;
		switch (code) {
			case ISO2022_JP :
			case JIS_HALFKANA :
				resolved = ISO2022_JP;
				break;
			case EUC_JP :
				resolved = EUC_JP;
				break;
			default :
				resolved = SHIFT_JIS;
				break;
		}
		return convertToIANAEncodingName(resolved);
	}

	/**
	 * Classifies the character starting at <code>ch1</code>. The checks are
	 * order dependent: the first matching rule wins.
	 * 
	 * @param ch1
	 *            byte under inspection, as an unsigned value (0..255)
	 * @param ch2
	 *            following byte as an unsigned value, or 0 if there is none
	 * @param ch3
	 *            byte after <code>ch2</code> as an unsigned value, or 0 if
	 *            there is none
	 * @param halfkana_flag
	 *            bit 0x01 gives Shift-JIS half-width kana priority over the
	 *            double-byte checks; bit 0x02 enables detection of EUC-JP
	 *            half-width kana
	 * @return one of the internal result codes of this class
	 */
	private static int guessJapaneseKanjiCode(int ch1, int ch2, int ch3, int halfkana_flag) {
		boolean detectSjisHalfKana = (halfkana_flag & 0x01) != 0;
		boolean detectEucHalfKana = (halfkana_flag & 0x02) != 0;
		boolean ch1IsHalfKana = ch1 >= 0xa1 && ch1 <= 0xdf;
		boolean ch2IsHalfKana = ch2 >= 0xa1 && ch2 <= 0xdf;
		boolean ch1IsEscape = ch1 == 0x1b;

		if (ch1 == 0) {
			return UNKNOWN;
		}
		if (detectSjisHalfKana && ch1IsHalfKana) {
			return SJIS_HALFKANA;
		}
		if (detectEucHalfKana && ch1 == 0x8e && ch2IsHalfKana) {
			return EUC_HALFKANA;
		}
		if ((ktype[ch1] & KT_SFT1) != 0 && (ktype[ch2] & KT_SFT2) != 0) {
			return SHIFT_JIS;
		}
		if ((ktype[ch1] & KT_EUC1) != 0 && (ktype[ch2] & KT_EUC2) != 0) {
			return EUC_JP;
		}
		if (ch1IsEscape && (ktype[ch2] & KT_JIN) != 0) {
			return ISO2022_JP;
		}
		if (ch1IsHalfKana) {
			return SJIS_HALFKANA;
		}
		if (ch1IsEscape && ch2 == 0x28/* '(' */&& ch3 == 0x49/* 'I' */) {
			return JIS_HALFKANA;
		}
		if (ch1IsEscape && ch2 == 0x28/* '(' */&& (ktype[ch3] & KT_JOUT) != 0) {
			return ASCII_IN;
		}

		return ASCII;
	}

	/**
	 * Creates an instance. All functionality is static; the constructor is
	 * kept public for compatibility with existing callers.
	 */
	public EncodingGuesser() {
		super();
	}
}

Back to the top