Emilio Lahr-Vivaz created ARROW-2500: ----------------------------------------
Summary: [Java] IPC Writers/readers are not always setting validity bits correctly
Key: ARROW-2500
URL: https://issues.apache.org/jira/browse/ARROW-2500
Project: Apache Arrow
Issue Type: Bug
Components: Java - Vectors
Affects Versions: 0.9.0, 0.8.0
Reporter: Emilio Lahr-Vivaz

When writing multiple batches to a Stream/File Writer, the first validity bit can get garbled between writing and reading. I couldn't pinpoint the exact issue, but I was able to re-create it with a fairly simple unit test.

In TestArrowStream.java:

{code:java}
@Test
public void testReadWriteMultipleBatches() throws IOException {
  ByteArrayOutputStream os = new ByteArrayOutputStream();
  try (IntVector vector = new IntVector("foo", allocator);) {
    Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
    try (VectorSchemaRoot root = new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
         ArrowStreamWriter writer = new ArrowStreamWriter(root, new MapDictionaryProvider(), Channels.newChannel(os));) {
      writer.start();
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setNull(3);
      vector.setSafe(4, 1);
      vector.setValueCount(5);
      root.setRowCount(5);
      writer.writeBatch();
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setValueCount(3);
      root.setRowCount(3);
      writer.writeBatch();
    }
  }
  ByteArrayInputStream in = new ByteArrayInputStream(os.toByteArray());
  try (ArrowStreamReader reader = new ArrowStreamReader(in, allocator);) {
    IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);
    reader.loadNextBatch();
    assertEquals(read.getValueCount(), 5);
    assertNull(read.getObject(0));
    assertEquals(read.getObject(1), Integer.valueOf(1));
    assertEquals(read.getObject(2), Integer.valueOf(2));
    assertNull(read.getObject(3));
    assertEquals(read.getObject(4), Integer.valueOf(1));
    reader.loadNextBatch();
    assertEquals(read.getValueCount(), 3);
    assertNull(read.getObject(0));
    assertEquals(read.getObject(1), Integer.valueOf(1));
    assertEquals(read.getObject(2), Integer.valueOf(2));
  }
}
{code}

In TestArrowFile.java:

{code}
@Test
public void testReadWriteMultipleBatches() throws IOException {
  File file = new File("target/mytest_nulls_multibatch.arrow");
  try (IntVector vector = new IntVector("foo", allocator);) {
    Schema schema = new Schema(Collections.singletonList(vector.getField()), null);
    try (FileOutputStream fileOutputStream = new FileOutputStream(file);
         VectorSchemaRoot root = new VectorSchemaRoot(schema, Collections.singletonList((FieldVector) vector), vector.getValueCount());
         ArrowFileWriter writer = new ArrowFileWriter(root, new MapDictionaryProvider(), fileOutputStream.getChannel());) {
      writer.start();
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setNull(3);
      vector.setSafe(4, 1);
      vector.setValueCount(5);
      root.setRowCount(5);
      writer.writeBatch();
      vector.setNull(0);
      vector.setSafe(1, 1);
      vector.setSafe(2, 2);
      vector.setValueCount(3);
      root.setRowCount(3);
      writer.writeBatch();
    }
  }
  try (FileInputStream fileInputStream = new FileInputStream(file);
       ArrowFileReader reader = new ArrowFileReader(fileInputStream.getChannel(), allocator);) {
    IntVector read = (IntVector) reader.getVectorSchemaRoot().getFieldVectors().get(0);
    reader.loadNextBatch();
    assertEquals(read.getValueCount(), 5);
    assertNull(read.getObject(0));
    assertEquals(read.getObject(1), Integer.valueOf(1));
    assertEquals(read.getObject(2), Integer.valueOf(2));
    assertNull(read.getObject(3));
    assertEquals(read.getObject(4), Integer.valueOf(1));
    reader.loadNextBatch();
    assertEquals(read.getValueCount(), 3);
    assertNull(read.getObject(0));
    assertEquals(read.getObject(1), Integer.valueOf(1));
    assertEquals(read.getObject(2), Integer.valueOf(2));
  }
}
{code}

--
This message was sent by Atlassian JIRA (v7.6.3#76005)