Skip to main content
summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
Diffstat (limited to 'plugins/org.eclipse.xtend/src/org/eclipse/internal/xtend/util/internal/icu/CharsetMatch.java')
-rw-r--r--plugins/org.eclipse.xtend/src/org/eclipse/internal/xtend/util/internal/icu/CharsetMatch.java286
1 files changed, 286 insertions, 0 deletions
diff --git a/plugins/org.eclipse.xtend/src/org/eclipse/internal/xtend/util/internal/icu/CharsetMatch.java b/plugins/org.eclipse.xtend/src/org/eclipse/internal/xtend/util/internal/icu/CharsetMatch.java
new file mode 100644
index 00000000..37278da0
--- /dev/null
+++ b/plugins/org.eclipse.xtend/src/org/eclipse/internal/xtend/util/internal/icu/CharsetMatch.java
@@ -0,0 +1,286 @@
+package org.eclipse.internal.xtend.util.internal.icu;
+/**
+This code was taken from the com.ibm.icu library (3.6.1). We needed to copy it to have not
+a dependency to this large library only the need of charset recognition.
+(KTH 2007-07-30)
+
+The com.ibm.icu library is published under Eclipse Public License V1.0.
+ */
+
+/**
+*******************************************************************************
+* Copyright (C) 2005-2006, International Business Machines Corporation and *
+* others. All Rights Reserved. *
+*******************************************************************************
+*/
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+
+
+/**
+ * This class represents a charset that has been identified by a CharsetDetector
+ * as a possible encoding for a set of input data. From an instance of this
+ * class, you can ask for a confidence level in the charset identification,
+ * or for Java Reader or String to access the original byte data in Unicode form.
+ * <p/>
+ * Instances of this class are created only by CharsetDetectors.
+ * <p/>
+ * Note: this class has a natural ordering that is inconsistent with equals.
+ * The natural ordering is based on the match confidence value.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+public class CharsetMatch implements Comparable {
+
+
+ /**
+ * Create a java.io.Reader for reading the Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * <p/>
+ * CAUTION: if the source of the byte data was an InputStream, a Reader
+ * can be created for only one matching char set using this method. If more
+ * than one charset needs to be tried, the caller will need to reset
+ * the InputStream and create InputStreamReaders itself, based on the charset name.
+ *
+ * @return the Reader for the Unicode character data.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public Reader getReader() {
+ InputStream inputStream = fInputStream;
+
+ if (inputStream == null) {
+ inputStream = new ByteArrayInputStream(fRawInput, 0, fRawLength);
+ }
+
+ try {
+ inputStream.reset();
+ return new InputStreamReader(inputStream, getName());
+ } catch (IOException e) {
+ return null;
+ }
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ *
+ * @return a String created from the converted input data.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public String getString() throws java.io.IOException {
+ return getString(-1);
+
+ }
+
+ /**
+ * Create a Java String from Unicode character data corresponding
+ * to the original byte data supplied to the Charset detect operation.
+ * The length of the returned string is limited to the specified size;
+ * the string will be trunctated to this length if necessary. A limit value of
+ * zero or less is ignored, and treated as no limit.
+ *
+ * @param maxLength The maximium length of the String to be created when the
+ * source of the data is an input stream, or -1 for
+ * unlimited length.
+ * @return a String created from the converted input data.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public String getString(int maxLength) throws java.io.IOException {
+ String result = null;
+ if (fInputStream != null) {
+ StringBuffer sb = new StringBuffer();
+ char[] buffer = new char[1024];
+ Reader reader = getReader();
+ int max = maxLength < 0? Integer.MAX_VALUE : maxLength;
+ int bytesRead = 0;
+
+ while ((bytesRead = reader.read(buffer, 0, Math.min(max, 1024))) >= 0) {
+ sb.append(buffer, 0, bytesRead);
+ max -= bytesRead;
+ }
+
+ reader.close();
+
+ return sb.toString();
+ } else {
+ result = new String(fRawInput, getName());
+ }
+ return result;
+
+ }
+
+ /**
+ * Get an indication of the confidence in the charset detected.
+ * Confidence values range from 0-100, with larger numbers indicating
+ * a better match of the input data to the characteristics of the
+ * charset.
+ *
+ * @return the confidence in the charset match
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int getConfidence() {
+ return fConfidence;
+ }
+
+
+ /**
+ * Bit flag indicating the match is based on the the encoding scheme.
+ *
+ * @see #getMatchType
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ static public final int ENCODING_SCHEME = 1;
+
+ /**
+ * Bit flag indicating the match is based on the presence of a BOM.
+ *
+ * @see #getMatchType
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ static public final int BOM = 2;
+
+ /**
+ * Bit flag indicating he match is based on the declared encoding.
+ *
+ * @see #getMatchType
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ static public final int DECLARED_ENCODING = 4;
+
+ /**
+ * Bit flag indicating the match is based on language statistics.
+ *
+ * @see #getMatchType
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ static public final int LANG_STATISTICS = 8;
+
+ /**
+ * Return flags indicating what it was about the input data
+ * that caused this charset to be considered as a possible match.
+ * The result is a bitfield containing zero or more of the flags
+ * ENCODING_SCHEME, BOM, DECLARED_ENCODING, and LANG_STATISTICS.
+ * A result of zero means no information is available.
+ * <p>
+ * Note: currently, this method always returns zero.
+ * <p>
+ *
+ * @return the type of match found for this charset.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int getMatchType() {
+// TODO: create a list of enum-like constants for common combinations of types of matches.
+ return 0;
+ }
+
+ /**
+ * Get the name of the detected charset.
+ * The name will be one that can be used with other APIs on the
+ * platform that accept charset names. It is the "Canonical name"
+ * as defined by the class java.nio.charset.Charset; for
+ * charsets that are registered with the IANA charset registry,
+ * this is the MIME-preferred registerd name.
+ *
+ * @see java.nio.charset.Charset
+ * @see java.io.InputStreamReader
+ *
+ * @return The name of the charset.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public String getName() {
+ return fRecognizer.getName();
+ }
+
+ /**
+ * Get the ISO code for the language of the detected charset.
+ *
+ * @return The ISO code for the language or <code>null</code> if the language cannot be determined.
+ *
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public String getLanguage() {
+ return fRecognizer.getLanguage();
+ }
+
+
+ /**
+ * Compare to other CharsetMatch objects.
+ * Comparison is based on the match confidence value, which
+ * allows CharsetDetector.detectAll() to order its results.
+ *
+ * @param o the CharsetMatch object to compare against.
+ * @return a negative integer, zero, or a positive integer as the
+ * confidence level of this CharsetMatch
+ * is less than, equal to, or greater than that of
+ * the argument.
+ * @throws ClassCastException if the argument is not a CharsetMatch.
+ * @draft ICU 3.4
+ * @provisional This API might change or be removed in a future release.
+ */
+ public int compareTo (Object o) {
+ CharsetMatch other = (CharsetMatch)o;
+ int compareResult = 0;
+ if (this.fConfidence > other.fConfidence) {
+ compareResult = 1;
+ } else if (this.fConfidence < other.fConfidence) {
+ compareResult = -1;
+ }
+ return compareResult;
+ }
+
+ /**
+ * Constructor. Implementation internal
+ *
+ * @internal
+ */
+ CharsetMatch(CharsetDetector det, CharsetRecognizer rec, int conf) {
+ fRecognizer = rec;
+ fConfidence = conf;
+
+ // The references to the original aplication input data must be copied out
+ // of the charset recognizer to here, in case the application resets the
+ // recognizer before using this CharsetMatch.
+ if (det.fInputStream == null) {
+ // We only want the existing input byte data if it came straight from the user,
+ // not if is just the head of a stream.
+ fRawInput = det.fRawInput;
+ fRawLength = det.fRawLength;
+ }
+ fInputStream = det.fInputStream;
+ }
+
+
+ //
+ // Private Data
+ //
+ private int fConfidence;
+ private CharsetRecognizer fRecognizer;
+ private byte[] fRawInput = null; // Original, untouched input bytes.
+ // If user gave us a byte array, this is it.
+ private int fRawLength; // Length of data in fRawInput array.
+
+ private InputStream fInputStream = null; // User's input stream, or null if the user
+ // gave us a byte array.
+}

Back to the top