From b3e6ebf416221f649b168bdf83b21aa91f97d34c Mon Sep 17 00:00:00 2001 From: Thomas Becker Date: Wed, 21 Sep 2011 11:47:38 +0200 Subject: Fix for #358121 (Utf8Appendable refactored to use Bjoern Hoehrmann's decoder). Signed-off-by: Simone Bordet --- .../org/eclipse/jetty/util/Utf8Appendable.java | 225 +++++++++++---------- .../org/eclipse/jetty/util/Utf8StringBuffer.java | 54 ++--- .../org/eclipse/jetty/util/Utf8StringBuilder.java | 48 +++-- .../eclipse/jetty/util/Utf8StringBufferTest.java | 112 +++++----- .../eclipse/jetty/util/Utf8StringBuilderTest.java | 118 ++++------- 5 files changed, 265 insertions(+), 292 deletions(-) diff --git a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java index b869f431b9..c646979de5 100644 --- a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java +++ b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8Appendable.java @@ -1,180 +1,181 @@ +// ======================================================================== +// Copyright (c) 2006-2009 Mort Bay Consulting Pty. Ltd. +// ------------------------------------------------------------------------ +// All rights reserved. This program and the accompanying materials +// are made available under the terms of the Eclipse Public License v1.0 +// and Apache License v2.0 which accompanies this distribution. +// The Eclipse Public License is available at +// http://www.eclipse.org/legal/epl-v10.html +// The Apache License v2.0 is available at +// http://www.opensource.org/licenses/apache2.0.php +// You may elect to redistribute this code under either of these licenses. 
+// ======================================================================== package org.eclipse.jetty.util; import java.io.IOException; -import java.util.IllegalFormatCodePointException; +/* ------------------------------------------------------------ */ +/** + * Utf8 Appendable abstract base class + * + * This abstract class wraps a standard {@link java.lang.Appendable} and provides methods to append UTF-8 encoded bytes, that are converted into characters. + * + * This class is stateful and up to 4 calls to {@link #append(byte)} may be needed before a character is appended to the wrapped Appendable. + * + * The UTF-8 decoding is done by this class and no additional buffers or Readers are used. The UTF-8 code was inspired by + * http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * + * License information for Bjoern Hoehrmann's code: + * + * Copyright (c) 2008-2009 Bjoern Hoehrmann + * Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS + * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER + * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ **/ public abstract class Utf8Appendable { private final char REPLACEMENT = '\ufffd'; + private static final int UTF8_ACCEPT = 0; + private static final int UTF8_REJECT = 12; + protected final Appendable _appendable; - protected int _expectedContinuationBytes; - protected int _codePoint; - protected int _minCodePoint; + protected int _state = UTF8_ACCEPT; + + private static final byte[] BYTE_TABLE = + { + // The first part of the table maps bytes to character classes that + // to reduce the size of the transition table and create bitmasks. + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, + 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8 + }; + + private static final byte[] TRANS_TABLE = + { + // The second part is a transition table that maps a combination + // of a state of the automaton and a character class to a state. 
+ 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12, + 12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12, + 12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12, + 12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12, + 12,36,12,12,12,12,12,12,12,12,12,12 + }; + + private int _codep; public Utf8Appendable(Appendable appendable) { - _appendable=appendable; + _appendable = appendable; } public abstract int length(); - + + protected void reset() + { + _state = UTF8_ACCEPT; + } + public void append(byte b) { try { appendByte(b); } - catch(IOException e) + catch (IOException e) { throw new RuntimeException(e); } } - - public void append(byte[] b,int offset, int length) + + public void append(byte[] b, int offset, int length) { try { - int end=offset+length; - for (int i=offset; imaxChars) + if (length() > maxChars) return false; appendByte(b[i]); } return true; } - catch(IOException e) + catch (IOException e) { throw new RuntimeException(e); } } - + protected void appendByte(byte b) throws IOException { - // Check for invalid bytes - if (b==(byte)0xc0 || b==(byte)0xc1 || (int)b>=0xf5) + + if (b > 0 && isUtf8SequenceComplete()) { - _appendable.append(REPLACEMENT); - _expectedContinuationBytes=0; - _codePoint=0; - throw new NotUtf8Exception(); + _appendable.append((char)(b & 0xFF)); } - - // Is it plain ASCII? - if (b>=0) - { - // Were we expecting a continuation byte? 
- if (_expectedContinuationBytes>0) - { - _appendable.append(REPLACEMENT); - _expectedContinuationBytes=0; - _codePoint=0; - throw new NotUtf8Exception(); - } - else - _appendable.append((char)(0x7f&b)); - } - // Else is this a start byte - else if (_expectedContinuationBytes==0) + else { - if ((b & 0xe0) == 0xc0) - { - //110xxxxx - _expectedContinuationBytes=1; - _codePoint=b&0x1f; - _minCodePoint=0x80; - } - else if ((b & 0xf0) == 0xe0) - { - //1110xxxx - _expectedContinuationBytes=2; - _codePoint=b&0x0f; - _minCodePoint=0x800; - } - else if ((b & 0xf8) == 0xf0) - { - //11110xxx - _expectedContinuationBytes=3; - _codePoint=b&0x07; - _minCodePoint=0x10000; - } - else if ((b & 0xfc) == 0xf8) - { - //111110xx - _expectedContinuationBytes=4; - _codePoint=b&0x03; - _minCodePoint=0x200000; - } - else if ((b & 0xfe) == 0xfc) + int i = b & 0xFF; + int type = BYTE_TABLE[i]; + _codep = isUtf8SequenceComplete() ? (0xFF >> type) & i : (i & 0x3F) | (_codep << 6); + _state = TRANS_TABLE[_state + type]; + + if (isUtf8SequenceComplete()) { - //1111110x - _expectedContinuationBytes=5; - _codePoint=b&0x01; - _minCodePoint=0x400000; + if (_codep < Character.MIN_HIGH_SURROGATE) + { + _appendable.append((char)_codep); + } + else + { + for (char c : Character.toChars(_codep)) + _appendable.append(c); + } } - else + else if (_state == UTF8_REJECT) { + _state = UTF8_ACCEPT; _appendable.append(REPLACEMENT); - _expectedContinuationBytes=0; - _codePoint=0; throw new NotUtf8Exception(); } } - // else is this a continuation character - else if ((b&0xc0)==0x80) - { - // 10xxxxxx - _codePoint=(_codePoint<<6)|(b&0x3f); - - // was that the last continuation? - if (--_expectedContinuationBytes==0) - { - // If this a valid unicode point? 
- if (_codePoint<_minCodePoint || (_codePoint>=0xD800 && _codePoint<=0xDFFF)) - { - _appendable.append(REPLACEMENT); - _expectedContinuationBytes=0; - _codePoint=0; - throw new NotUtf8Exception(); - } - - _minCodePoint=0; - char[] chars = Character.toChars(_codePoint); - for (char c : chars) - _appendable.append(c); - } - } - // Else this is not a continuation character - else - { - // ! 10xxxxxx - _appendable.append(REPLACEMENT); - _expectedContinuationBytes=0; - _codePoint=0; - throw new NotUtf8Exception(); - } } + protected boolean isUtf8SequenceComplete() + { + return _state == UTF8_ACCEPT; + } public static class NotUtf8Exception extends IllegalArgumentException { public NotUtf8Exception() { - super("!UTF-8"); + super("Not valid UTF8!"); } } -} \ No newline at end of file +} diff --git a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java index bd730deabf..b86058e584 100644 --- a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java +++ b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuffer.java @@ -4,71 +4,73 @@ // All rights reserved. This program and the accompanying materials // are made available under the terms of the Eclipse Public License v1.0 // and Apache License v2.0 which accompanies this distribution. -// The Eclipse Public License is available at +// The Eclipse Public License is available at // http://www.eclipse.org/legal/epl-v10.html // The Apache License v2.0 is available at // http://www.opensource.org/licenses/apache2.0.php -// You may elect to redistribute this code under either of these licenses. +// You may elect to redistribute this code under either of these licenses. // ======================================================================== package org.eclipse.jetty.util; -import java.io.IOException; - /* ------------------------------------------------------------ */ -/** UTF-8 StringBuffer. 
+/** + * UTF-8 StringBuffer. * - * This class wraps a standard {@link java.lang.StringBuffer} and provides methods to append + * This class wraps a standard {@link java.lang.StringBuffer} and provides methods to append * UTF-8 encoded bytes, that are converted into characters. - * - * This class is stateful and up to 6 calls to {@link #append(byte)} may be needed before + * + * This class is stateful and up to 4 calls to {@link #append(byte)} may be needed before * state a character is appended to the string buffer. - * + * * The UTF-8 decoding is done by this class and no additional buffers or Readers are used. - * The UTF-8 code was inspired by http://javolution.org - * - * This class is not synchronised and should probably be called Utf8StringBuilder + * The UTF-8 code was inspired by http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ */ -public class Utf8StringBuffer extends Utf8Appendable +public class Utf8StringBuffer extends Utf8Appendable { final StringBuffer _buffer; - + public Utf8StringBuffer() { super(new StringBuffer()); - _buffer=(StringBuffer)_appendable; + _buffer = (StringBuffer)_appendable; } - + public Utf8StringBuffer(int capacity) { super(new StringBuffer(capacity)); - _buffer=(StringBuffer)_appendable; + _buffer = (StringBuffer)_appendable; } + @Override public int length() { return _buffer.length(); } - + + @Override public void reset() { + super.reset(); _buffer.setLength(0); - _expectedContinuationBytes=0; - _codePoint=0; } - + public StringBuffer getStringBuffer() { - if (_expectedContinuationBytes!=0) - throw new NotUtf8Exception(); + checkState(); return _buffer; } - + @Override public String toString() { - if (_expectedContinuationBytes!=0) - throw new NotUtf8Exception(); + checkState(); return _buffer.toString(); } + + private void checkState() + { + if (!isUtf8SequenceComplete()) + throw new IllegalArgumentException("Tried to read incomplete UTF8 decoded String"); + } } diff --git 
a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java index 541590f642..09866884ea 100644 --- a/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java +++ b/jetty-util/src/main/java/org/eclipse/jetty/util/Utf8StringBuilder.java @@ -4,70 +4,74 @@ // All rights reserved. This program and the accompanying materials // are made available under the terms of the Eclipse Public License v1.0 // and Apache License v2.0 which accompanies this distribution. -// The Eclipse Public License is available at +// The Eclipse Public License is available at // http://www.eclipse.org/legal/epl-v10.html // The Apache License v2.0 is available at // http://www.opensource.org/licenses/apache2.0.php -// You may elect to redistribute this code under either of these licenses. +// You may elect to redistribute this code under either of these licenses. // ======================================================================== package org.eclipse.jetty.util; -import java.io.IOException; /* ------------------------------------------------------------ */ /** UTF-8 StringBuilder. * - * This class wraps a standard {@link java.lang.StringBuffer} and provides methods to append + * This class wraps a standard {@link java.lang.StringBuilder} and provides methods to append * UTF-8 encoded bytes, that are converted into characters. - * - * This class is stateful and up to 6 calls to {@link #append(byte)} may be needed before + * + * This class is stateful and up to 4 calls to {@link #append(byte)} may be needed before * state a character is appended to the string buffer. - * + * * The UTF-8 decoding is done by this class and no additional buffers or Readers are used. 
- * The UTF-8 code was inspired by http://javolution.org - * + * The UTF-8 code was inspired by http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ + * */ -public class Utf8StringBuilder extends Utf8Appendable +public class Utf8StringBuilder extends Utf8Appendable { final StringBuilder _buffer; - + public Utf8StringBuilder() { super(new StringBuilder()); _buffer=(StringBuilder)_appendable; } - + public Utf8StringBuilder(int capacity) { super(new StringBuilder(capacity)); _buffer=(StringBuilder)_appendable; } - + + @Override public int length() { return _buffer.length(); } - + + @Override public void reset() { + super.reset(); _buffer.setLength(0); - _expectedContinuationBytes=0; - _codePoint=0; } - + public StringBuilder getStringBuilder() { - if (_expectedContinuationBytes!=0) - throw new NotUtf8Exception(); + checkState(); return _buffer; } - + @Override public String toString() { - if (_expectedContinuationBytes!=0) - throw new NotUtf8Exception(); + checkState(); return _buffer.toString(); } + + private void checkState() + { + if (!isUtf8SequenceComplete()) + throw new IllegalArgumentException("Tried to read incomplete UTF8 decoded String"); + } } diff --git a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java index 9c44625e8f..eacd85b33b 100644 --- a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java +++ b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBufferTest.java @@ -4,92 +4,98 @@ // All rights reserved. This program and the accompanying materials // are made available under the terms of the Eclipse Public License v1.0 // and Apache License v2.0 which accompanies this distribution. 
-// The Eclipse Public License is available at +// The Eclipse Public License is available at // http://www.eclipse.org/legal/epl-v10.html // The Apache License v2.0 is available at // http://www.opensource.org/licenses/apache2.0.php -// You may elect to redistribute this code under either of these licenses. +// You may elect to redistribute this code under either of these licenses. // ======================================================================== package org.eclipse.jetty.util; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; +import java.io.UnsupportedEncodingException; import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class Utf8StringBufferTest { - public void testUtfStringBuffer() - throws Exception + @Test + public void testUtfStringBuffer() throws Exception { - String source="abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty"; + String source = "abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty"; byte[] bytes = source.getBytes(StringUtil.__UTF8); Utf8StringBuffer buffer = new Utf8StringBuffer(); - for (int i=0;i=0); - } - assertEquals("abc\ufffd",buffer.toString()); + for (byte aByte : bytes) + buffer.append(aByte); } - - @Test - public void testUTF32codes() - throws Exception + + + @Test + public void testUTF32codes() throws Exception { - String source="\uD842\uDF9F"; - byte[] bytes=source.getBytes("UTF-8"); - + String source = "\uD842\uDF9F"; + byte[] bytes = source.getBytes("UTF-8"); + String jvmcheck = new String(bytes,0,bytes.length,"UTF-8"); assertEquals(source,jvmcheck); - + Utf8StringBuffer buffer = new Utf8StringBuffer(); buffer.append(bytes,0,bytes.length); - String result=buffer.toString(); + String result = buffer.toString(); assertEquals(source,result); } + @Test + public void testGermanUmlauts() throws Exception + { + byte[] bytes = new byte[6]; + bytes[0] = (byte)0xC3; + bytes[1] = (byte)0xBC; + bytes[2] = (byte)0xC3; + bytes[3] = 
(byte)0xB6; + bytes[4] = (byte)0xC3; + bytes[5] = (byte)0xA4; + + Utf8StringBuffer buffer = new Utf8StringBuffer(); + for (int i = 0; i < bytes.length; i++) + buffer.append(bytes[i]); + + assertEquals("\u00FC\u00F6\u00E4",buffer.toString()); + } + + @Test(expected = Utf8Appendable.NotUtf8Exception.class) + public void testInvalidUTF8() throws UnsupportedEncodingException + { + Utf8StringBuffer buffer = new Utf8StringBuffer(); + buffer.append((byte)0xC2); + buffer.append((byte)0xC2); + } } diff --git a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java index bfa0cccd87..b83aa1099d 100644 --- a/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java +++ b/jetty-util/src/test/java/org/eclipse/jetty/util/Utf8StringBuilderTest.java @@ -4,142 +4,102 @@ // All rights reserved. This program and the accompanying materials // are made available under the terms of the Eclipse Public License v1.0 // and Apache License v2.0 which accompanies this distribution. -// The Eclipse Public License is available at +// The Eclipse Public License is available at // http://www.eclipse.org/legal/epl-v10.html // The Apache License v2.0 is available at // http://www.opensource.org/licenses/apache2.0.php -// You may elect to redistribute this code under either of these licenses. +// You may elect to redistribute this code under either of these licenses. 
// ======================================================================== package org.eclipse.jetty.util; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - import org.junit.Test; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class Utf8StringBuilderTest { @Test - public void testInvalid() - throws Exception + public void testInvalid() throws Exception { - String[] invalids = { - "c0af", - "EDA080", - "f08080af", - "f8808080af", - "e080af", - "F4908080", - "fbbfbfbfbf" - }; - + String[] invalids = + { "c0af", "EDA080", "f08080af", "f8808080af", "e080af", "F4908080", "fbbfbfbfbf", "10FFFF" }; + for (String i : invalids) { byte[] bytes = TypeUtil.fromHexString(i); - - /* Test what JVM does - try - { - String s = new String(bytes,0,bytes.length,"UTF-8"); - System.err.println(i+": "+s); - } - catch(Exception e) - { - System.err.println(i+": "+e); - } - */ - try { Utf8StringBuilder buffer = new Utf8StringBuilder(); buffer.append(bytes,0,bytes.length); - + assertEquals(i,"not expected",buffer.toString()); } - catch(IllegalArgumentException e) + catch (Utf8Appendable.NotUtf8Exception e) { assertTrue(i,true); } } } - + @Test - public void testUtfStringBuilder() - throws Exception + public void testUtfStringBuilder() throws Exception { - String source="abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty"; + String source = "abcd012345\n\r\u0000\u00a4\u10fb\ufffdjetty"; byte[] bytes = source.getBytes(StringUtil.__UTF8); Utf8StringBuilder buffer = new Utf8StringBuilder(); - for (int i=0;i=0); - } + buffer.toString(); } - + @Test - public void testLong() - throws Exception + public void testLong() throws Exception { - String source="abcXX"; + String source = "abcXX"; byte[] bytes = source.getBytes(StringUtil.__UTF8); - bytes[3]=(byte)0xc0; - bytes[4]=(byte)0x00; + bytes[3] = (byte)0xc0; + bytes[4] = (byte)0x00; Utf8StringBuilder buffer = new Utf8StringBuilder(); try { - for (int i = 0; i < 
bytes.length; i++) - buffer.append(bytes[i]); + for (byte aByte : bytes) + buffer.append(aByte); assertTrue(false); } - catch(Utf8Appendable.NotUtf8Exception e) + catch (IllegalArgumentException e) { assertTrue(true); } - assertEquals("abc\ufffd", buffer.toString()); + assertEquals("abc\ufffd",buffer.toString()); } - - @Test - public void testUTF32codes() - throws Exception + @Test + public void testUTF32codes() throws Exception { - String source="\uD842\uDF9F"; - byte[] bytes=source.getBytes("UTF-8"); - - // System.err.println(TypeUtil.toHexString(bytes)); + String source = "\uD842\uDF9F"; + byte[] bytes = source.getBytes("UTF-8"); + String jvmcheck = new String(bytes,0,bytes.length,"UTF-8"); assertEquals(source,jvmcheck); - + Utf8StringBuilder buffer = new Utf8StringBuilder(); buffer.append(bytes,0,bytes.length); - String result=buffer.toString(); + String result = buffer.toString(); assertEquals(source,result); } - - } -- cgit v1.2.3