Author: niallp
Date: Sat Nov 29 17:24:12 2008
New Revision: 721749

URL: http://svn.apache.org/viewvc?rev=721749&view=rev
Log:
IO-178 BOMExclusionInputStream - an InputStream for UTF-8 data that ignores an 
initial Byte Order mark - thanks for patch from Keith D Gregory

Added:
    
commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
   (with props)
    
commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
   (with props)

Added: 
commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
URL: 
http://svn.apache.org/viewvc/commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java?rev=721749&view=auto
==============================================================================
--- 
commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
 (added)
+++ 
commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
 Sat Nov 29 17:24:12 2008
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+/**
+ *  This class is used to wrap a UTF8-encoded stream that may include an
+ *  encoded Byte Order Mark (BOM, 0xFEFF encoded as 0xEF 0xBB 0xBF) as its
+ *  first bytes. Such streams are produced by various Microsoft applications.
+ *  This class will automatically skip these bytes and return the subsequent
+ *  byte as the first byte in the stream.
+ *  <p>
+ *  If the first byte in the stream is 0xEF, this class will attempt to read
+ *  the next two bytes. If those bytes do not complete a BOM (or the stream
+ *  ends before they can be read), every byte that was actually read is
+ *  handed back to the caller unchanged and in its original order.
+ *
+ * @version $Revision$ $Date$
+ * @since Commons IO 2.0
+ */
+public class BOMExclusionInputStream extends ProxyInputStream {
+    /** Number of bytes in the UTF-8 encoded BOM. */
+    private static final int BOM_LENGTH = 3;
+
+    /**
+     * Look-ahead bytes that turned out not to be a BOM and must still be
+     * returned to the caller; <code>null</code> until the start of the stream
+     * has been examined.
+     */
+    private int[] firstBytes;
+
+    /** Number of valid entries in <code>firstBytes</code>. */
+    private int fbLength;
+
+    /** Index of the next entry of <code>firstBytes</code> to hand out. */
+    private int fbIndex;
+
+    /** Value of <code>fbIndex</code> at the last mark, or -1 if never marked. */
+    private int markFbIndex = -1;
+
+    /** Whether the last mark was set before the start was examined. */
+    private boolean markedAtStart;
+
+    /**
+     * Constructs a new BOM Exclusion InputStream.
+     * @param delegate the InputStream to delegate to
+     */
+    public BOMExclusionInputStream(InputStream delegate) {
+        super(delegate);
+    }
+
+    /**
+     * This method reads and either preserves or skips the first bytes in the
+     * stream. It behaves like the single-byte <code>read()</code> method,
+     * either returning a valid byte or -1. A result of -1 means that the
+     * initial bytes have been processed already (or that the delegate was at
+     * end of stream); callers must then continue with the delegate.
+     * @return the byte read (excluding BOM) or -1 as described above
+     * @throws IOException if an I/O error occurs
+     */
+    private int readFirstBytes() throws IOException {
+        if (firstBytes == null) {
+            firstBytes = new int[BOM_LENGTH];
+            int b0 = in.read();
+            if (b0 != 0xEF) {
+                // not a BOM (this also covers b0 == -1, an empty stream)
+                return b0;
+            }
+
+            int b1 = in.read();
+            // do not read past an end of stream that was already reported
+            int b2 = (b1 < 0) ? -1 : in.read();
+            if ((b1 == 0xBB) && (b2 == 0xBF)) {
+                return in.read();
+            }
+
+            // not a BOM after all: keep only the bytes that really exist so
+            // that an end-of-stream marker is never replayed as data
+            firstBytes[fbLength++] = b0;
+            if (b1 >= 0) {
+                firstBytes[fbLength++] = b1;
+            }
+            if (b2 >= 0) {
+                firstBytes[fbLength++] = b2;
+            }
+        }
+
+        return (fbIndex < fbLength) ? firstBytes[fbIndex++] : -1;
+    }
+
+    //----------------------------------------------------------------------------
+    //  Implementation of InputStream
+    //----------------------------------------------------------------------------
+
+    /**
+     * Invokes the delegate's <code>read()</code> method, skipping BOM.
+     * @return the byte read (excluding BOM) or -1 if the end of stream
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read() throws IOException {
+        int b = readFirstBytes();
+        return (b >= 0) ? b : in.read();
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[], int, int)</code> method, skipping BOM.
+     * @param buf the buffer to read the bytes into
+     * @param off The start offset
+     * @param len The number of bytes to read (excluding BOM)
+     * @return the number of bytes read, or -1 if no byte could be read
+     * because the end of stream was reached
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(byte[] buf, int off, int len) throws IOException {
+        int firstCount = 0;
+        int b = 0;
+        // drain the start-of-stream bytes one at a time
+        while ((len > 0) && (b >= 0)) {
+            b = readFirstBytes();
+            if (b >= 0) {
+                buf[off++] = (byte) (b & 0xFF);
+                len--;
+                firstCount++;
+            }
+        }
+        int secondCount = in.read(buf, off, len);
+        if (secondCount < 0) {
+            // report end of stream unless some bytes were delivered above;
+            // returning 0 here would make read-until-minus-one loops spin
+            return (firstCount > 0) ? firstCount : -1;
+        }
+        return firstCount + secondCount;
+    }
+
+    /**
+     * Invokes the delegate's <code>read(byte[])</code> method, skipping BOM.
+     * @param buf the buffer to read the bytes into
+     * @return the number of bytes read (excluding BOM)
+     * or -1 if the end of stream
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public int read(byte[] buf) throws IOException {
+        return read(buf, 0, buf.length);
+    }
+
+    /**
+     * Invokes the delegate's <code>mark(int)</code> method, remembering how
+     * far the start-of-stream processing had progressed.
+     * @param readlimit read ahead limit
+     */
+    @Override
+    public synchronized void mark(int readlimit) {
+        markedAtStart = (firstBytes == null);
+        markFbIndex = fbIndex;
+        int limit = readlimit;
+        if (markedAtStart && (readlimit <= Integer.MAX_VALUE - BOM_LENGTH)) {
+            // the delegate may also have to replay a BOM the caller never sees
+            limit = readlimit + BOM_LENGTH;
+        }
+        in.mark(limit);
+    }
+
+    /**
+     * Invokes the delegate's <code>reset()</code> method and restores the
+     * start-of-stream state that was current when the mark was set.
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public synchronized void reset() throws IOException {
+        if (markedAtStart) {
+            // force the start of the stream to be examined again from scratch
+            firstBytes = null;
+            fbLength = 0;
+            fbIndex = 0;
+        } else if (markFbIndex >= 0) {
+            // replay any preserved bytes handed out after the mark
+            fbIndex = markFbIndex;
+        }
+
+        in.reset();
+    }
+
+    /**
+     * Invokes the delegate's <code>skip(long)</code> method, skipping BOM.
+     * @param n the number of bytes to skip
+     * @return the number of bytes actually skipped (excluding BOM)
+     * @throws IOException if an I/O error occurs
+     */
+    @Override
+    public long skip(long n) throws IOException {
+        long skippedFirst = 0;
+        while ((n > skippedFirst) && (readFirstBytes() >= 0)) {
+            skippedFirst++;
+        }
+        // count the bytes consumed above as well as those the delegate skips
+        return skippedFirst + in.skip(n - skippedFirst);
+    }
+}

Propchange: 
commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: 
commons/proper/io/trunk/src/java/org/apache/commons/io/input/BOMExclusionInputStream.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision

Added: 
commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
URL: 
http://svn.apache.org/viewvc/commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java?rev=721749&view=auto
==============================================================================
--- 
commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
 (added)
+++ 
commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
 Sat Nov 29 17:24:12 2008
@@ -0,0 +1,247 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.commons.io.input;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+
+import junit.framework.TestCase;
+
+/**
+ * Tests for <code>BOMExclusionInputStream</code>: reading, skipping and
+ * mark/reset over streams with and without a leading UTF-8 BOM.
+ */
+public class BOMExclusionInputStreamTest extends TestCase {
+    //----------------------------------------------------------------------------
+    //  Support code
+    //----------------------------------------------------------------------------
+
+    /**
+     *  Creates the underlying data stream, with or without BOM.
+     */
+    public InputStream createDataStream(byte[] baseData, boolean addBOM) {
+        byte[] data = baseData;
+        if (addBOM) {
+            data = new byte[baseData.length + 3];
+            data[0] = (byte) 0xEF;
+            data[1] = (byte) 0xBB;
+            data[2] = (byte) 0xBF;
+            System.arraycopy(baseData, 0, data, 3, baseData.length);
+        }
+        return new ByteArrayInputStream(data);
+    }
+
+    /**
+     *  Asserts that <code>len</code> equals the expected length and that the
+     *  bytes of <code>actual</code> starting at <code>off</code> match
+     *  <code>expected</code>.
+     */
+    private void assertData(byte[] expected, byte[] actual, int off, int len)
+        throws Exception {
+        assertEquals("length", expected.length, len);
+        for (int ii = 0; ii < expected.length; ii++) {
+            assertEquals("byte " + ii, expected[ii], actual[off + ii]);
+        }
+    }
+
+    /**
+     *  A mock InputStream that expects <code>close()</code> to be called.
+     */
+    private static class ExpectCloseInputStream extends InputStream {
+        private boolean closeCalled;
+
+        @Override
+        public void close() throws IOException {
+            closeCalled = true;
+        }
+
+        @Override
+        public int read() throws IOException {
+            return -1;
+        }
+
+        public void assertCloseCalled() {
+            assertTrue(closeCalled);
+        }
+    }
+
+    //----------------------------------------------------------------------------
+    //  Test cases
+    //----------------------------------------------------------------------------
+
+    // make sure that our support code works as expected
+    public void testSupportCode() throws Exception {
+        InputStream in = createDataStream(new byte[] { 'A', 'B' }, true);
+        byte[] buf = new byte[1024];
+        int len = in.read(buf);
+        assertEquals(5, len);
+        assertEquals(0xEF, buf[0] & 0xFF);
+        assertEquals(0xBB, buf[1] & 0xFF);
+        assertEquals(0xBF, buf[2] & 0xFF);
+        assertEquals('A', buf[3] & 0xFF);
+        assertEquals('B', buf[4] & 0xFF);
+
+        assertData(
+                new byte[] { (byte) 0xEF, (byte) 0xBB, (byte) 0xBF, 'A', 'B' },
+                buf, 0, len);
+    }
+
+    public void testReadWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertEquals('A', in.read());
+        assertEquals('B', in.read());
+        assertEquals('C', in.read());
+        assertEquals(-1, in.read());
+    }
+
+    public void testReadWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertEquals('A', in.read());
+        assertEquals('B', in.read());
+        assertEquals('C', in.read());
+        assertEquals(-1, in.read());
+    }
+
+    public void testLargeBufferWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        byte[] buf = new byte[1024];
+        assertData(data, buf, 0, in.read(buf));
+    }
+
+    public void testLargeBufferWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        byte[] buf = new byte[1024];
+        assertData(data, buf, 0, in.read(buf));
+    }
+
+    public void testSmallBufferWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        byte[] buf = new byte[1024];
+        assertData(new byte[] { 'A', 'B' }, buf, 0, in.read(buf, 0, 2));
+        assertData(new byte[] { 'C' }, buf, 0, in.read(buf, 0, 2));
+    }
+
+    public void testSmallBufferWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        byte[] buf = new byte[1024];
+        assertData(new byte[] { 'A', 'B' }, buf, 0, in.read(buf, 0, 2));
+        assertData(new byte[] { 'C' }, buf, 0, in.read(buf, 0, 2));
+    }
+
+    // the bytes must land at the requested offset, not at the buffer start
+    public void testReadIntoOffsetWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        byte[] buf = new byte[1024];
+        assertData(new byte[] { 'A', 'B' }, buf, 5, in.read(buf, 5, 2));
+        assertEquals(0, buf[0]);
+        assertEquals(0, buf[4]);
+    }
+
+    public void testLeadingNonBOMSingleRead() throws Exception {
+        byte[] data = new byte[] { (byte) 0xEF, (byte) 0xAB, (byte) 0xCD };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertEquals(0xEF, in.read());
+        assertEquals(0xAB, in.read());
+        assertEquals(0xCD, in.read());
+        assertEquals(-1, in.read());
+    }
+
+    public void testLeadingNonBOMBufferedRead() throws Exception {
+        byte[] data = new byte[] { (byte) 0xEF, (byte) 0xAB, (byte) 0xCD };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        byte[] buf = new byte[1024];
+        assertData(data, buf, 0, in.read(buf));
+    }
+
+    public void testSkipWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        in.skip(2L);
+        assertEquals('C', in.read());
+    }
+
+    public void testSkipWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        in.skip(2L);
+        assertEquals('C', in.read());
+    }
+
+    public void testMarkResetAfterReadWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertTrue(in.markSupported());
+
+        in.read();
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('B', in.read());
+    }
+
+    public void testMarkResetAfterReadWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertTrue(in.markSupported());
+
+        in.read();
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('B', in.read());
+    }
+
+    public void testMarkResetBeforeReadWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertTrue(in.markSupported());
+
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('A', in.read());
+    }
+
+    public void testMarkResetBeforeReadWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertTrue(in.markSupported());
+
+        in.mark(10);
+
+        in.read();
+        in.read();
+        in.reset();
+        assertEquals('A', in.read());
+    }
+
+    public void testAvailableWithoutBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, false));
+        assertEquals(4, in.available());
+    }
+
+    // available() is not BOM-aware: before the first read it still counts the BOM
+    public void testAvailableWithBOM() throws Exception {
+        byte[] data = new byte[] { 'A', 'B', 'C', 'D' };
+        InputStream in = new BOMExclusionInputStream(createDataStream(data, true));
+        assertEquals(7, in.available());
+    }
+
+    // this is here for coverage
+    public void testClose() throws Exception {
+        ExpectCloseInputStream del = new ExpectCloseInputStream();
+        InputStream in = new BOMExclusionInputStream(del);
+
+        in.close();
+        del.assertCloseCalled();
+    }
+}

Propchange: 
commons/proper/io/trunk/src/test/org/apache/commons/io/input/BOMExclusionInputStreamTest.java
------------------------------------------------------------------------------
    svn:mergeinfo = 


Reply via email to