[ 
https://issues.apache.org/jira/browse/ARROW-2500?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16449781#comment-16449781
 ] 

Emilio Lahr-Vivaz commented on ARROW-2500:
------------------------------------------

Note: this didn't seem to occur in 0.6.

> [Java] IPC Writers/readers are not always setting validity bits correctly
> -------------------------------------------------------------------------
>
>                 Key: ARROW-2500
>                 URL: https://issues.apache.org/jira/browse/ARROW-2500
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Java - Vectors
>    Affects Versions: 0.8.0, 0.9.0
>            Reporter: Emilio Lahr-Vivaz
>            Priority: Major
>
> When writing multiple batches to a Stream/File Writer, the first validity bit 
> can get garbled between writing and reading. I couldn't pinpoint the exact 
> issue, but I was able to re-create it with a fairly simple unit test.
> In TestArrowStream.java:
> {code:java}
>   @Test
>   public void testReadWriteMultipleBatches() throws IOException {
>     ByteArrayOutputStream os = new ByteArrayOutputStream();
>     try (IntVector vector = new IntVector("foo", allocator);) {
>       Schema schema = new 
> Schema(Collections.singletonList(vector.getField()), null);
>       try (VectorSchemaRoot root = new VectorSchemaRoot(schema, 
> Collections.singletonList((FieldVector) vector), vector.getValueCount());
>            ArrowStreamWriter writer = new ArrowStreamWriter(root, new 
> MapDictionaryProvider(), Channels.newChannel(os));) {
>         writer.start();
>         vector.setNull(0);
>         vector.setSafe(1, 1);
>         vector.setSafe(2, 2);
>         vector.setNull(3);
>         vector.setSafe(4, 1);
>         vector.setValueCount(5);
>         root.setRowCount(5);
>         writer.writeBatch();
>         vector.setNull(0);
>         vector.setSafe(1, 1);
>         vector.setSafe(2, 2);
>         vector.setValueCount(3);
>         root.setRowCount(3);
>         writer.writeBatch();
>       }
>     }
>     ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());
>     try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator);) {
>       IntVector read = (IntVector) 
> reader.getVectorSchemaRoot().getFieldVectors().get(0);
>       reader.loadNextBatch();
>       assertEquals(read.getValueCount(), 5);
>       assertNull(read.getObject(0));
>       assertEquals(read.getObject(1), Integer.valueOf(1));
>       assertEquals(read.getObject(2), Integer.valueOf(2));
>       assertNull(read.getObject(3));
>       assertEquals(read.getObject(4), Integer.valueOf(1));
>       reader.loadNextBatch();
>       assertEquals(read.getValueCount(), 3);
>       assertNull(read.getObject(0));
>       assertEquals(read.getObject(1), Integer.valueOf(1));
>       assertEquals(read.getObject(2), Integer.valueOf(2));
>     }
>   }
> {code}
> In TestArrowFile.java:
> {code:java}
>  @Test
>   public void testReadWriteMultipleBatches() throws IOException {
>     File file = new File("target/mytest_nulls_multibatch.arrow");
>     try (IntVector vector = new IntVector("foo", allocator);) {
>       Schema schema = new 
> Schema(Collections.singletonList(vector.getField()), null);
>       try (FileOutputStream fileOutputStream = new FileOutputStream(file);
>            VectorSchemaRoot root = new VectorSchemaRoot(schema, 
> Collections.singletonList((FieldVector) vector), vector.getValueCount());
>            ArrowFileWriter writer = new ArrowFileWriter(root, new 
> MapDictionaryProvider(), fileOutputStream.getChannel());) {
>         writer.start();
>         vector.setNull(0);
>         vector.setSafe(1, 1);
>         vector.setSafe(2, 2);
>         vector.setNull(3);
>         vector.setSafe(4, 1);
>         vector.setValueCount(5);
>         root.setRowCount(5);
>         writer.writeBatch();
>         vector.setNull(0);
>         vector.setSafe(1, 1);
>         vector.setSafe(2, 2);
>         vector.setValueCount(3);
>         root.setRowCount(3);
>         writer.writeBatch();
>       }
>     }
>     try (FileInputStream fileInputStream = new FileInputStream(file);
>          ArrowFileReader reader = new 
> ArrowFileReader(fileInputStream.getChannel(), allocator);) {
>       IntVector read = (IntVector) 
> reader.getVectorSchemaRoot().getFieldVectors().get(0);
>       reader.loadNextBatch();
>       assertEquals(read.getValueCount(), 5);
>       assertNull(read.getObject(0));
>       assertEquals(read.getObject(1), Integer.valueOf(1));
>       assertEquals(read.getObject(2), Integer.valueOf(2));
>       assertNull(read.getObject(3));
>       assertEquals(read.getObject(4), Integer.valueOf(1));
>       reader.loadNextBatch();
>       assertEquals(read.getValueCount(), 3);
>       assertNull(read.getObject(0));
>       assertEquals(read.getObject(1), Integer.valueOf(1));
>       assertEquals(read.getObject(2), Integer.valueOf(2));
>     }
>   }
> {code}



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to