Skip to main content
aboutsummaryrefslogtreecommitdiffstats
blob: 9996d6e569903271c73f169c74779fdd5f1d4ff0 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
/******************************************************************************
 * Copyright (c) 2009, 2020 Borland Software Corporation, CEA LIST, Artal
 * 
 * All rights reserved. This program and the accompanying materials
 * are made available under the terms of the Eclipse Public License 2.0
 * which accompanies this distribution, and is available at
 * https://www.eclipse.org/legal/epl-2.0/ 
 * 
 * SPDX-License-Identifier: EPL-2.0
 *
 * Contributors: 
 *    Artem Tikhomirov (Borland) - initial API and implementation
 *     Aurelien Didier (ARTAL) - aurelien.didier51@gmail.com - Bug 569174
 *****************************************************************************/
package org.eclipse.papyrus.gmf.internal.xpand.inactive;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import org.eclipse.papyrus.gmf.internal.xpand.Activator;

/**
 * Guesses the character encoding of an {@link InputStream} and creates a {@link Reader} for it.
 * <p>
 * Detection inspects at most the first {@value #DETECTION_LIMIT} bytes and always rewinds the
 * stream afterwards:
 * <ol>
 * <li>a UTF-16BE, UTF-16LE or UTF-8 byte order mark selects the corresponding charset;</li>
 * <li>otherwise, if both byte pairs {@code C2 AB} and {@code C2 BB} (the UTF-8 encoding of the
 * « and » characters) occur, the result is UTF-8;</li>
 * <li>otherwise, if both single bytes {@code AB} and {@code BB} (« and » in ISO-8859-1) occur,
 * the result is {@link #LEGACY_ENCODING};</li>
 * <li>otherwise the default encoding passed to the constructor is used.</li>
 * </ol>
 * Instances cache the detected encoding and the created reader; no synchronization is performed.
 * <p>
 * FIXME tests!!! (especially that C2AB and C2BB without BOM give UTF8)
 * @author artem
 */
public class StreamDecoder {

	public static final Charset LEGACY_ENCODING = StandardCharsets.ISO_8859_1;

	/**
	 * Number of leading bytes inspected during detection; also used as the mark read limit.
	 * Pure guess, most templates, even those with EPL comment header, got smth that far.
	 */
	private static final int DETECTION_LIMIT = 1024;

	private final InputStream myInputStream;
	private final Charset myDefaultEncoding;
	private Reader myResult;
	private Charset myEncoding;

	/**
	 * @param is can't be null; wrapped into a {@link BufferedInputStream} if it does not support marks
	 * @param defaultEncoding may be null, in which case the platform default charset is used
	 *        by {@link #getReader()} when detection fails
	 */
	public StreamDecoder(InputStream is, Charset defaultEncoding) {
		assert is != null;
		myInputStream = ensureMarkSupported(is);
		myDefaultEncoding = defaultEncoding;
	}

	/**
	 * Lazily creates the reader, detecting the encoding first if that has not happened yet.
	 * <p>
	 * NOTE(review): detection rewinds the stream to its first byte, so a byte order mark, if
	 * present, is still in the stream when the reader is created. The UTF-16BE/UTF-16LE decoders
	 * (and, presumably, the UTF-8 one) hand it to the caller as U+FEFF - confirm that consumers
	 * cope with that.
	 *
	 * @return the same reader instance on every call
	 */
	public Reader getReader() {
		if (myResult == null) {
			myResult = createReader(myInputStream, getEncoding());
		}
		return myResult;
	}

	/**
	 * Detects the encoding on first use. Note that a {@code null} result (no detection and no
	 * default encoding) is not cached, so detection is attempted again on the next call.
	 *
	 * @return defaultEncoding (possibly null), if can't detect
	 */
	public Charset getEncoding() {
		if (myEncoding == null) {
			myEncoding = detectEncoding(myInputStream);
		}
		return myEncoding;
	}

	/**
	 * Inspects the head of the stream and rewinds it to the position it had on entry.
	 *
	 * @param is stream that supports marks
	 * @return detected charset, or the default encoding if nothing was recognized, the stream
	 *         holds fewer than three bytes without a UTF-16 byte order mark, or reading failed
	 */
	protected Charset detectEncoding(InputStream is) {
		assert is.markSupported();
		is.mark(DETECTION_LIMIT);
		try {
			final int b1 = is.read();
			final int b2 = is.read();
			if (b1 == -1 || b2 == -1) {
				return myDefaultEncoding;
			}
			if (b1 == 0xFE && b2 == 0xFF) {
				return StandardCharsets.UTF_16BE;
			}
			if (b1 == 0xFF && b2 == 0xFE) {
				return StandardCharsets.UTF_16LE;
			}
			final int b3 = is.read();
			if (b3 == -1) {
				return myDefaultEncoding;
			}
			if (b1 == 0xEF && b2 == 0xBB && b3 == 0xBF) {
				return StandardCharsets.UTF_8;
			}
			is.reset(); // no BOM, scan all over again from the first byte
			final Charset guess = guessFromGuillemets(is);
			return guess != null ? guess : myDefaultEncoding;
		} catch (IOException ignored) {
			// detection is best-effort: a stream that can't be read here will fail
			// again, visibly, once the caller reads from it
			return myDefaultEncoding;
		} finally {
			try {
				is.reset();
			} catch (IOException ex) {
				// XXX actually, should avoid using Activator as it may trigger plugin initialization
				// but as long as it can barely happen here...
				Activator.logError(ex);
			}
		}
	}

	/**
	 * Scans up to {@link #DETECTION_LIMIT} bytes for the bytes 0xAB and 0xBB, noting separately
	 * whether each of them ever directly follows 0xC2.
	 * <p>
	 * The two kinds of sighting are tracked independently on purpose: a 0xAB/0xBB that is merely
	 * the trailing byte of some other multi-byte UTF-8 character (e.g. {@code C3 AB} or
	 * {@code D0 BB}) must not prevent a later {@code C2 AB}/{@code C2 BB} from being recognized.
	 *
	 * @return UTF-8, {@link #LEGACY_ENCODING}, or null when neither pattern was found
	 */
	private static Charset guessFromGuillemets(InputStream is) throws IOException {
		boolean seenAB = false;
		boolean seenBB = false;
		boolean seenC2AB = false;
		boolean seenC2BB = false;
		boolean previousWasC2 = false;
		for (int remaining = DETECTION_LIMIT; remaining > 0; remaining--) {
			final int b = is.read();
			if (b == -1) {
				break;
			}
			if (b == 0xAB) {
				seenAB = true;
				seenC2AB |= previousWasC2;
			} else if (b == 0xBB) {
				seenBB = true;
				seenC2BB |= previousWasC2;
			}
			if (seenC2AB && seenC2BB) {
				break; // verdict can't change any more
			}
			previousWasC2 = b == 0xC2;
		}
		if (seenC2AB && seenC2BB) {
			return StandardCharsets.UTF_8;
		}
		if (seenAB && seenBB) {
			return LEGACY_ENCODING;
		}
		return null;
	}

	/**
	 * @param is stream to decode
	 * @param encoding charset to decode with; if null, the platform default charset is used
	 * @return new reader over the given stream
	 */
	protected Reader createReader(InputStream is, Charset encoding) {
		return encoding != null ? new InputStreamReader(is, encoding) : new InputStreamReader(is);
	}

	/**
	 * @return same or wrapped input stream that has {@link InputStream#markSupported()} == true
	 */
	public static InputStream ensureMarkSupported(InputStream is) {
		return is.markSupported() ? is : new BufferedInputStream(is);
	}
}

Back to the top