diff options
Diffstat (limited to 'bundles/org.eclipse.wst.css.core/src/org/eclipse/wst/css/core/internal/contenttype/EncodingGuesser.java')
-rw-r--r-- | bundles/org.eclipse.wst.css.core/src/org/eclipse/wst/css/core/internal/contenttype/EncodingGuesser.java | 173 |
1 files changed, 173 insertions, 0 deletions
diff --git a/bundles/org.eclipse.wst.css.core/src/org/eclipse/wst/css/core/internal/contenttype/EncodingGuesser.java b/bundles/org.eclipse.wst.css.core/src/org/eclipse/wst/css/core/internal/contenttype/EncodingGuesser.java new file mode 100644 index 0000000000..82f6e5c49a --- /dev/null +++ b/bundles/org.eclipse.wst.css.core/src/org/eclipse/wst/css/core/internal/contenttype/EncodingGuesser.java @@ -0,0 +1,173 @@ +/******************************************************************************* + * Copyright (c) 2001, 2004 IBM Corporation and others. + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Public License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/legal/epl-v10.html + * + * Contributors: + * IBM Corporation - initial API and implementation + * Jens Lukowski/Innoopract - initial renaming/restructuring + * + *******************************************************************************/ +package org.eclipse.wst.css.core.internal.contenttype; + +/** + * + * This is ported from PageDesigner's hpbcom/Kanji.cpp's + * Kanji::guess_kanji_code(), + * + */ +public class EncodingGuesser { + private static final int ASCII = 0; // ASCII + // ISO-2022-JP + private static final int ASCII_IN = 8; // This is after ISO2022's change + // Shift-JIS + private static final int EUC_HALFKANA = 6; // This is Half Kana in EUC-JP + private static final int EUC_JP = 3; // This is EUC-JP + private static final int ISO2022_JP = 4; // This is ISO-2022-JP + private static final int JIS_HALFKANA = 7; // THis is Half Kana in + private static final byte KT_EUC1 = 0x40; + private static final byte KT_EUC2 = (byte) 0x80; + // ASCII + private static final byte KT_JIN = 0x01; + private static final byte KT_JOUT = 0x02; + // private static final byte KT_ESC = 0x04; + // private static final byte KT_JIS = 0x08; + private static final byte KT_SFT1 = 0x10; + private static final byte KT_SFT2 = 0x20; + private static final byte ktype[] = {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 00 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 10 */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x08, 0x08, 0x08, 0x09, 0x08, 0x08, 0x08, /* !"#$%&' *//* " */ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, /* ()*+,-./ */ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, /* 01234567 */ + 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, 0x08, /* 89:; <=>? */ + 0x29, 0x28, 0x2b, 0x28, 0x28, 0x28, 0x28, 0x28, /* @ABCDEFG */ + 0x2a, 0x28, 0x2a, 0x28, 0x28, 0x28, 0x28, 0x28, /* HIJKLMNO */ + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* PQRSTUVW */ + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* XYZ[\]^_ */ + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* abcdefg */ + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* hijklmno */ + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, /* pqrstuvw */ + 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x28, 0x00, /* xyz{|}~ */ + 0x20, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, /* 80 */ + 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, /* 90 */ + 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x30, 0x20, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* A0 */ + (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* B0 */ + (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* C0 */ + (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, /* D0 */ + (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xe0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, /* E0 */ + (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, /* F0 */ + (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xf0, (byte) 0xc0, (byte) 0xc0, 0x00,}; + // private static final int ISO8859_1 = 1; // ISO-1 + private static final int SHIFT_JIS = 2; // This is Shift-JIS + private static final int SJIS_HALFKANA = 5; // This is Half Kana in + + /** + * Currently, only Japanese encodings are supported. + */ + private static final int UNKNOWN = -1; // Unknown + + /** + * @return java.lang.String + * @param code + * int + * + * Convert private int to IANA Encoding name. + */ + private static String convertToIANAEncodingName(int code) { + String encoding = null; + + switch (code) { + case SHIFT_JIS : + case SJIS_HALFKANA : + encoding = "Shift_JIS";//$NON-NLS-1$ + break; + case EUC_JP : + case EUC_HALFKANA : + encoding = "EUC-JP";//$NON-NLS-1$ + break; + case ISO2022_JP : + case JIS_HALFKANA : + encoding = "ISO-2022-JP";//$NON-NLS-1$ + default : + break; + } + + return encoding; + } + + /** + * Return guessed Java Encoding name target: bytes to be inspected length: + * length of target + */ + public static String guessEncoding(byte[] target, int length) { + int code = UNKNOWN; + + // Currently, only Japanese is supported. + String system_ctype = java.util.Locale.getDefault().getLanguage(); + String jp_ctype = java.util.Locale.JAPANESE.getLanguage(); + if (system_ctype.compareTo(jp_ctype) == 0) { + // Ok, I'm under ja_JP. + code = ASCII; + int pos = 0; + while ((code == ASCII) && (length > 0)) { + int ch1 = target[pos]; + ch1 = ch1 & 0x000000FF; + int ch2 = (length >= 2) ? target[pos + 1] : 0; + ch2 = ch2 & 0x000000FF; + int ch3 = (length >= 3) ? target[pos + 2] : 0; + ch3 = ch3 & 0x000000FF; + code = guessJapaneseKanjiCode(ch1, ch2, ch3, 0); + pos++; + length--; + } + switch (code) { + case ISO2022_JP : + case JIS_HALFKANA : + code = ISO2022_JP; + break; + case EUC_JP : + code = EUC_JP; + break; + default : + code = SHIFT_JIS; + } + } + return (convertToIANAEncodingName(code)); + } + + /** + * Guess the encoding. halfkana_flag = 0x01 ( detect SJIS half kana ) + * halfkana_flag = 0x02 ( detect EUC half kana ) + */ + private static int guessJapaneseKanjiCode(int ch1, int ch2, int ch3, int halfkana_flag) { + boolean sjis_hankaku_flag = ((halfkana_flag & 0x01) != 0) ? true : false; + boolean euc_hankaku_flag = ((halfkana_flag & 0x02) != 0) ? true : false; + + if (ch1 == 0) + return UNKNOWN; + if (sjis_hankaku_flag && ch1 >= 0xa1 && ch1 <= 0xdf) + return SJIS_HALFKANA; + else if (euc_hankaku_flag && ch1 == 0x8e && ch2 >= 0xa1 && ch2 <= 0xdf) + return EUC_HALFKANA; + else if (((ktype[ch1] & KT_SFT1) != 0) && ((ktype[ch2] & KT_SFT2) != 0)) + return SHIFT_JIS; + else if (((ktype[ch1] & KT_EUC1) != 0) && ((ktype[ch2] & KT_EUC2) != 0)) + return EUC_JP; + else if (ch1 == 0x1b && ((ktype[ch2] & KT_JIN) != 0)) + return ISO2022_JP; + else if (ch1 >= 0xa1 && ch1 <= 0xdf) + return SJIS_HALFKANA; + else if (ch1 == 0x1b && ch2 == 0x28/* '(' */&& ch3 == 0x49/* 'I' */) + return JIS_HALFKANA; + else if (ch1 == 0x1b && ch2 == 0x28/* '(' */&& ((ktype[ch3] & KT_JOUT) != 0)) + return ASCII_IN; + + return ASCII; + } + + public EncodingGuesser() { + super(); + } +} |