Author: niallp Date: Sat Nov 29 17:24:12 2008 New Revision: 721749 URL: http://svn.apache.org/viewvc?rev=721749&view=rev Log: IO-178 BOMExclusionInputStream - an InputStream for UTF-8 data that ignores an initial Byte Order mark - thanks for patch from Keith D Gregory
Added:
    commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java (with props)
    commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java (with props)

Added: commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java?rev=721749&view=auto
==============================================================================
--- commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java (added)
+++ commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java Sat Nov 29 17:24:12 2008
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ * This class is used to wrap a UTF8-encoded stream that includes an encoded
+ * Byte Order Mark (BOM, 0xFEFF encoded as 0xEF 0xBB 0xBF) as its first bytes.
+ * Such streams are produced by various Microsoft applications. This class
+ * will automatically skip these bytes and return the subsequent byte as the
+ * first byte in the stream.
+ * <p>
+ * The BOM is detected lazily, on the first read or skip. At most three bytes
+ * are read from the delegate for that purpose, and reading stops at the first
+ * byte that does not match the BOM or at the end of the stream. Bytes that
+ * were read but do not form a complete BOM are handed to the caller
+ * unchanged, so a stream without a BOM passes through unmodified.
+ *
+ * @version $Revision$ $Date$
+ * @since Commons IO 2.0
+ */
+public class BOMExclusionInputStream extends ProxyInputStream {
+    /** The UTF-8 encoding of the Byte Order Mark (U+FEFF). */
+    private static final int[] UTF8_BOM = { 0xEF, 0xBB, 0xBF };
+
+    /** Value of <code>markFbIndex</code> while no mark has been set. */
+    private static final int NO_MARK = -1;
+
+    /** Leading non-BOM bytes read ahead from the delegate; null until detection has run. */
+    private int[] firstBytes;
+    /** Number of valid entries in <code>firstBytes</code>. */
+    private int fbLength;
+    /** Index of the next entry of <code>firstBytes</code> to return. */
+    private int fbIndex;
+    /** Value of <code>fbIndex</code> at the last <code>mark(int)</code>, or <code>NO_MARK</code>. */
+    private int markFbIndex = NO_MARK;
+    /** Whether the last <code>mark(int)</code> was made before BOM detection had run. */
+    private boolean markedAtStart;
+
+    /**
+     * Constructs a new BOM Exclusion InputStream.
+     * @param delegate the InputStream to delegate to
+     */
+    public BOMExclusionInputStream(InputStream delegate) {
+        super(delegate);
+    }
+
+    /**
+     * Returns the next leading byte that was read ahead from the delegate
+     * but is not part of a BOM. The first call (and the first call after a
+     * reset to a mark set at the start of the stream) performs the BOM
+     * detection: a complete BOM is discarded, anything else is kept.
+     * @return the next read-ahead byte, or -1 if there is none left, in
+     * which case the caller must continue with the delegate itself
+     * @throws IOException if an I/O error occurs
+     */
+    private int readFirstBytes() throws IOException {
+        if (firstBytes == null) {
+            firstBytes = new int[UTF8_BOM.length];
+            // detection can run again after reset(), so start from scratch
+            fbLength = 0;
+            fbIndex = 0;
+
+            int matched = 0;
+            while (matched < UTF8_BOM.length) {
+                int b = in.read();
+                if (b < 0) {
+                    break; // end of stream: never buffer the EOF marker
+                }
+                firstBytes[fbLength++] = b;
+                if (b != UTF8_BOM[matched]) {
+                    break; // not a BOM: stop reading ahead
+                }
+                matched++;
+            }
+            if (matched == UTF8_BOM.length) {
+                fbLength = 0; // complete BOM: drop it
+            }
+        }
+
+        return (fbIndex < fbLength) ? firstBytes[fbIndex++] : -1;
+    }
+
+    //----------------------------------------------------------------------------
+    // Implementation of InputStream
+    //----------------------------------------------------------------------------
+
+    /**
+     * Invokes the delegate's <code>read()</code> method, skipping BOM.
+     * @return the byte read (excluding BOM) or -1 if the end of stream
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read() throws IOException {
+        int b = readFirstBytes();
+        return (b >= 0) ? b : in.read();
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[], int, int)</code> method, skipping BOM.
+     * @param buf the buffer to read the bytes into
+     * @param off The start offset
+     * @param len The number of bytes to read (excluding BOM)
+     * @return the number of bytes read or -1 if the end of stream
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(byte[] buf, int off, int len) throws IOException {
+        int firstCount = 0;
+        while (len > 0) {
+            int b = readFirstBytes();
+            if (b < 0) {
+                break;
+            }
+            buf[off++] = (byte) b;
+            len--;
+            firstCount++;
+        }
+        if ((firstCount > 0) && (len == 0)) {
+            return firstCount; // satisfied entirely from the read-ahead bytes
+        }
+
+        int secondCount = in.read(buf, off, len);
+        if (secondCount < 0) {
+            // report end of stream only if nothing at all was read
+            return (firstCount > 0) ? firstCount : -1;
+        }
+        return firstCount + secondCount;
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[])</code> method, skipping BOM.
+     * @param buf the buffer to read the bytes into
+     * @return the number of bytes read (excluding BOM)
+     * or -1 if the end of stream
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(byte[] buf) throws IOException {
+        return read(buf, 0, buf.length);
+    }
+
+    /**
+     * Invokes the delegate's <code>mark(int)</code> method and remembers
+     * how much of the read-ahead data had been returned at that point.
+     * @param readlimit read ahead limit
+     */
+    @Override
+    public synchronized void mark(int readlimit) {
+        markedAtStart = (firstBytes == null);
+        markFbIndex = markedAtStart ? 0 : fbIndex;
+        int limit = readlimit;
+        if (markedAtStart && (limit <= Integer.MAX_VALUE - UTF8_BOM.length)) {
+            // BOM detection may consume up to three delegate bytes the
+            // caller never sees; they must not count against the limit
+            limit += UTF8_BOM.length;
+        }
+        in.mark(limit);
+    }
+
+    /**
+     * Invokes the delegate's <code>reset()</code> method and restores the
+     * read-ahead state recorded by <code>mark(int)</code>.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public synchronized void reset() throws IOException {
+        // reset the delegate first so that a failure leaves our state intact
+        in.reset();
+        if (markedAtStart) {
+            firstBytes = null; // run BOM detection again on the next read
+        } else if (markFbIndex != NO_MARK) {
+            fbIndex = markFbIndex;
+        }
+    }
+
+    /**
+     * Invokes the delegate's <code>skip(long)</code> method, skipping BOM.
+     * @param n the number of bytes to skip
+     * @return the actual number of bytes skipped (excluding BOM)
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public long skip(long n) throws IOException {
+        long skipped = 0;
+        while ((n > 0) && (readFirstBytes() >= 0)) {
+            n--;
+            skipped++;
+        }
+        return skipped + in.skip(n);
+    }
+}

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision

Added: commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
URL: http://svn.apache.org/viewvc/commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java?rev=721749&view=auto
==============================================================================
--- commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java (added)
+++ commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java Sat Nov 29 17:24:12 2008
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+/** Unit tests for <code>BOMExclusionInputStream</code>, with and without a leading BOM. */
+public class BOMExclusionInputStreamTest extends TestCase {
+    //----------------------------------------------------------------------------
+    // Support code
+    //----------------------------------------------------------------------------
+
+    /**
+     * Creates the underlying data stream, with or without BOM.
+     */
+    public InputStream createDataStream(byte[] baseData, boolean addBOM) {
+        byte[] data = baseData;
+        if (addBOM) {
+            data = new byte[baseData.length + 3];
+            data[0] = (byte) 0xEF;
+            data[1] = (byte) 0xBB;
+            data[2] = (byte) 0xBF;
+            System.arraycopy(baseData, 0, data, 3, baseData.length);
+        }
+        return new ByteArrayInputStream(data);
+    }
+
+    // Asserts that len equals expected.length and that actual, starting at
+    // index off, holds exactly the expected bytes.
+    private void assertData(byte[] expected, byte[] actual, int off, int len) throws Exception {
+        assertEquals("length", expected.length, len);
+        for (int ii = 0; ii < expected.length; ii++) {
+            assertEquals("byte " + ii, expected[ii], actual[off + ii]);
+        }
+    }
+
+    /**
+     * A mock InputStream that expects <code>close()</code> to be called.
+     */
+    private static class ExpectCloseInputStream extends InputStream {
+        private boolean _closeCalled;
+
+        @Override
+        public void close() throws IOException {
+            _closeCalled = true;
+        }
+
+        @Override
+        public int read() throws IOException {
+            return -1;
+        }
+
+        public void assertCloseCalled() {
+            assertTrue(_closeCalled);
+        }
+    }
+
+    //----------------------------------------------------------------------------
+    // Test cases
+    //----------------------------------------------------------------------------
+
+    // make sure that our support code works as expected
+    public void testSupportCode() throws Exception {
+        InputStream in = createDataStream(new byte[] { 'A', 'B' }, true);
+        byte[] buf = new byte[1024];
+        int len = in.read(buf);
+        assertEquals(5, len);
+        assertEquals(0xEF, buf[0] & 0xFF);
+        assertEquals(0xBB, buf[1] & 0xFF);
+        assertEquals(0xBF, buf[2] & 0xFF);
+        assertEquals('A', buf[3] & 0xFF);
+        assertEquals('B', buf[4] & 0xFF);
+
+        assertData(new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'A', 'B' }, buf, 0, len);
+    }
+
+    public void testReadWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertEquals('A', in.read());
+        assertEquals('B', in.read());
+        assertEquals('C', in.read());
+        assertEquals(-1, in.read());
+    }
+
+    public void testReadWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertEquals('A', in.read());
+        assertEquals('B', in.read());
+        assertEquals('C', in.read());
+        assertEquals(-1, in.read());
+    }
+
+    public void testLargeBufferWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        byte[] buf = new byte[1024];
+        assertData(data, buf, 0, in.read(buf));
+    }
+
+    public void testLargeBufferWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        byte[] buf = new byte[1024];
+        assertData(data, buf, 0, in.read(buf));
+    }
+
+    public void testSmallBufferWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        byte[] buf = new byte[1024];
+        assertData(new byte[] { 'A', 'B' }, buf, 0, in.read(buf, 0, 2));
+        assertData(new byte[] { 'C' }, buf, 0, in.read(buf, 0, 2));
+    }
+
+    public void testSmallBufferWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        byte[] buf = new byte[1024];
+        assertData(new byte[] { 'A', 'B' }, buf, 0, in.read(buf, 0, 2));
+        assertData(new byte[] { 'C' }, buf, 0, in.read(buf, 0, 2));
+    }
+
+    public void testLeadingNonBOMSingleRead() throws Exception {
+        byte[] data = new byte[] { (byte) 0xEF, (byte) 0xAB, (byte) 0xCD };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertEquals(0xEF, in.read());
+        assertEquals(0xAB, in.read());
+        assertEquals(0xCD, in.read());
+        assertEquals(-1, in.read());
+    }
+
+    public void testLeadingNonBOMBufferedRead() throws Exception {
+        byte[] data = new byte[] { (byte) 0xEF, (byte) 0xAB, (byte) 0xCD };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        byte[] buf = new byte[1024];
+        assertData(data, buf, 0, in.read(buf));
+    }
+
+    public void testSkipWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        in.skip(2L);
+        assertEquals('C', in.read());
+    }
+
+    public void testSkipWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        in.skip(2L);
+        assertEquals('C', in.read());
+    }
+
+    public void testMarkResetAfterReadWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertTrue(in.markSupported());
+
+        in.read();
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('B', in.read());
+    }
+
+    public void testMarkResetAfterReadWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertTrue(in.markSupported());
+
+        in.read();
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('B', in.read());
+    }
+
+    public void testMarkResetBeforeReadWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertTrue(in.markSupported());
+
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('A', in.read());
+    }
+
+    public void testMarkResetBeforeReadWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertTrue(in.markSupported());
+
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('A', in.read());
+    }
+
+    public void testAvailableWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertEquals(4, in.available());
+    }
+
+    public void testAvailableWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertEquals(7, in.available());
+    }
+
+    // this is here for coverage
+    public void testClose() throws Exception {
+        ExpectCloseInputStream del = new ExpectCloseInputStream();
+        InputStream in = new BOMExclusionInputStream(del);
+
+        in.close();
+        del.assertCloseCalled();
+    }
+}

Propchange: commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
------------------------------------------------------------------------------
    svn:mergeinfo =