This is an automated email from the ASF dual-hosted git repository.
lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new c2eda76eb8 ARROW-17107: [Java] Fix variable-width vectors in
integration JSON writer (#13676)
c2eda76eb8 is described below
commit c2eda76eb84f5bb8c7272b4c2b0cd6be05c06517
Author: David Li <[email protected]>
AuthorDate: Mon Jul 25 06:58:46 2022 -0400
ARROW-17107: [Java] Fix variable-width vectors in integration JSON writer
(#13676)
- Clarify that this is meant to be an internal format
- Fix handling of empty variable-width vectors
Authored-by: David Li <[email protected]>
Signed-off-by: David Li <[email protected]>
---
.../apache/arrow/vector/ipc/JsonFileReader.java | 6 +-
.../apache/arrow/vector/ipc/JsonFileWriter.java | 41 ++++++--
.../org/apache/arrow/vector/TestValueVector.java | 35 +++++++
.../org/apache/arrow/vector/ipc/TestJSONFile.java | 110 +++++++++++++++------
4 files changed, 152 insertions(+), 40 deletions(-)
diff --git
a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
index 6455857c27..742daeef25 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
@@ -784,8 +784,10 @@ public class JsonFileReader implements AutoCloseable,
DictionaryProvider {
@Override
public void close() throws IOException {
parser.close();
- for (Dictionary dictionary : dictionaries.values()) {
- dictionary.getVector().close();
+ if (dictionaries != null) {
+ for (Dictionary dictionary : dictionaries.values()) {
+ dictionary.getVector().close();
+ }
}
}
diff --git
a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
index 58760c1a94..f5e267e812 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
@@ -30,6 +30,7 @@ import java.util.Set;
import org.apache.arrow.memory.ArrowBuf;
import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.BaseLargeVariableWidthVector;
import org.apache.arrow.vector.BaseVariableWidthVector;
import org.apache.arrow.vector.BigIntVector;
import org.apache.arrow.vector.BitVectorHelper;
@@ -83,8 +84,10 @@ import
com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter;
import com.fasterxml.jackson.databind.MappingJsonFactory;
/**
- * A writer that converts binary Vectors into a JSON format suitable
+ * A writer that converts binary Vectors into an <em>internal, unstable</em>
JSON format suitable
* for integration testing.
+ *
+ * This writer does NOT implement a JSON dataset format like JSONL.
*/
public class JsonFileWriter implements AutoCloseable {
@@ -228,11 +231,21 @@ public class JsonFileWriter implements AutoCloseable {
vector.getMinorType() == MinorType.VARBINARY)) {
writeValueToGenerator(bufferType, vectorBuffer,
vectorBuffers.get(v - 1), vector, i);
} else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0
&&
- (vector.getMinorType() == MinorType.VARBINARY ||
vector.getMinorType() == MinorType.VARCHAR)) {
- ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(4);
- vectorBufferTmp.setInt(0, 0);
- writeValueToGenerator(bufferType, vectorBufferTmp, null, vector,
i);
- vectorBufferTmp.close();
+ (vector.getMinorType() == MinorType.LIST ||
vector.getMinorType() == MinorType.MAP ||
+ vector.getMinorType() == MinorType.VARBINARY ||
vector.getMinorType() == MinorType.VARCHAR)) {
+ // Empty vectors may not have allocated an offsets buffer
+ try (ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(4)) {
+ vectorBufferTmp.setInt(0, 0);
+ writeValueToGenerator(bufferType, vectorBufferTmp, null, vector,
i);
+ }
+ } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0
&&
+ (vector.getMinorType() == MinorType.LARGELIST ||
vector.getMinorType() == MinorType.LARGEVARBINARY ||
+ vector.getMinorType() == MinorType.LARGEVARCHAR)) {
+ // Empty vectors may not have allocated an offsets buffer
+ try (ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(8)) {
+ vectorBufferTmp.setLong(0, 0);
+ writeValueToGenerator(bufferType, vectorBufferTmp, null, vector,
i);
+ }
} else {
writeValueToGenerator(bufferType, vectorBuffer, null, vector, i);
}
@@ -267,7 +280,21 @@ public class JsonFileWriter implements AutoCloseable {
if (bufferType.equals(TYPE)) {
generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH));
} else if (bufferType.equals(OFFSET)) {
- generator.writeNumber(buffer.getInt(index *
BaseVariableWidthVector.OFFSET_WIDTH));
+ switch (vector.getMinorType()) {
+ case VARCHAR:
+ case VARBINARY:
+ case LIST:
+ case MAP:
+ generator.writeNumber(buffer.getInt((long) index *
BaseVariableWidthVector.OFFSET_WIDTH));
+ break;
+ case LARGELIST:
+ case LARGEVARBINARY:
+ case LARGEVARCHAR:
+ generator.writeNumber(buffer.getLong((long) index *
BaseLargeVariableWidthVector.OFFSET_WIDTH));
+ break;
+ default:
+ throw new IllegalArgumentException("Type has no offset buffer: " +
vector.getField());
+ }
} else if (bufferType.equals(VALIDITY)) {
generator.writeNumber(vector.isNull(index) ? 0 : 1);
} else if (bufferType.equals(DATA)) {
diff --git
a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 8858281063..516daa2362 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -190,6 +190,41 @@ public class TestValueVector {
}
}
+ @Test
+ public void testNoOverFlowWithUINT() {
+ try (final UInt8Vector uInt8Vector = new UInt8Vector("uint8", allocator);
+ final UInt4Vector uInt4Vector = new UInt4Vector("uint4", allocator);
+ final UInt1Vector uInt1Vector = new UInt1Vector("uint1", allocator)) {
+
+ long[] longValues = new long[]{Long.MIN_VALUE, Long.MAX_VALUE, -1L};
+ uInt8Vector.allocateNew(3);
+ uInt8Vector.setValueCount(3);
+ for (int i = 0; i < longValues.length; i++) {
+ uInt8Vector.set(i, longValues[i]);
+ long readValue = uInt8Vector.getObjectNoOverflow(i).longValue();
+ assertEquals(readValue, longValues[i]);
+ }
+
+ int[] intValues = new int[]{Integer.MIN_VALUE, Integer.MAX_VALUE, -1};
+ uInt4Vector.allocateNew(3);
+ uInt4Vector.setValueCount(3);
+ for (int i = 0; i < intValues.length; i++) {
+ uInt4Vector.set(i, intValues[i]);
+ int actualValue = (int)
UInt4Vector.getNoOverflow(uInt4Vector.getDataBuffer(), i);
+ assertEquals(intValues[i], actualValue);
+ }
+
+ byte[] byteValues = new byte[]{Byte.MIN_VALUE, Byte.MAX_VALUE, -1};
+ uInt1Vector.allocateNew(3);
+ uInt1Vector.setValueCount(3);
+ for (int i = 0; i < byteValues.length; i++) {
+ uInt1Vector.set(i, byteValues[i]);
+ byte actualValue = (byte)
UInt1Vector.getNoOverflow(uInt1Vector.getDataBuffer(), i);
+ assertEquals(byteValues[i], actualValue);
+ }
+ }
+ }
+
@Test /* IntVector */
public void testFixedType2() {
try (final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH,
allocator)) {
diff --git
a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
index f0aa226e26..0aa49d9daa 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
@@ -18,21 +18,29 @@
package org.apache.arrow.vector.ipc;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
import java.io.File;
import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.vector.FieldVector;
-import org.apache.arrow.vector.UInt1Vector;
-import org.apache.arrow.vector.UInt4Vector;
-import org.apache.arrow.vector.UInt8Vector;
import org.apache.arrow.vector.VectorSchemaRoot;
import org.apache.arrow.vector.complex.StructVector;
import org.apache.arrow.vector.complex.impl.ComplexWriterImpl;
import org.apache.arrow.vector.complex.writer.BaseWriter;
import org.apache.arrow.vector.dictionary.DictionaryProvider;
import
org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider;
+import org.apache.arrow.vector.types.UnionMode;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
import org.apache.arrow.vector.types.pojo.Schema;
import org.apache.arrow.vector.util.Validator;
import org.junit.Assert;
@@ -421,37 +429,77 @@ public class TestJSONFile extends BaseFileTest {
}
}
+ /** Regression test for ARROW-17107. */
@Test
- public void testNoOverFlowWithUINT() {
- try (final UInt8Vector uInt8Vector = new UInt8Vector("uint8", allocator);
- final UInt4Vector uInt4Vector = new UInt4Vector("uint4", allocator);
- final UInt1Vector uInt1Vector = new UInt1Vector("uint1", allocator)) {
-
- long[] longValues = new long[]{Long.MIN_VALUE, Long.MAX_VALUE, -1L};
- uInt8Vector.allocateNew(3);
- uInt8Vector.setValueCount(3);
- for (int i = 0; i < longValues.length; i++) {
- uInt8Vector.set(i, longValues[i]);
- long readValue = uInt8Vector.getObjectNoOverflow(i).longValue();
- assertEquals(readValue, longValues[i]);
- }
+ public void testRoundtripEmptyVector() throws Exception {
+ final List<Field> fields = Arrays.asList(
+ Field.nullable("utf8", ArrowType.Utf8.INSTANCE),
+ Field.nullable("largeutf8", ArrowType.LargeUtf8.INSTANCE),
+ Field.nullable("binary", ArrowType.Binary.INSTANCE),
+ Field.nullable("largebinary", ArrowType.LargeBinary.INSTANCE),
+ Field.nullable("fixedsizebinary", new ArrowType.FixedSizeBinary(2)),
+ Field.nullable("decimal128", new ArrowType.Decimal(3, 2, 128)),
+ Field.nullable("decimal128", new ArrowType.Decimal(3, 2, 256)),
+ new Field("list", FieldType.nullable(ArrowType.List.INSTANCE),
+ Collections.singletonList(Field.nullable("items", new
ArrowType.Int(32, true)))),
+ new Field("largelist",
FieldType.nullable(ArrowType.LargeList.INSTANCE),
+ Collections.singletonList(Field.nullable("items", new
ArrowType.Int(32, true)))),
+ new Field("map", FieldType.nullable(new ArrowType.Map(/*keyssorted*/
false)),
+ Collections.singletonList(new Field("items",
FieldType.notNullable(ArrowType.Struct.INSTANCE),
+ Arrays.asList(Field.notNullable("keys", new ArrowType.Int(32,
true)),
+ Field.nullable("values", new ArrowType.Int(32, true)))))),
+ new Field("fixedsizelist", FieldType.nullable(new
ArrowType.FixedSizeList(2)),
+ Collections.singletonList(Field.nullable("items", new
ArrowType.Int(32, true)))),
+ new Field("denseunion", FieldType.nullable(new
ArrowType.Union(UnionMode.Dense, new int[] {0})),
+ Collections.singletonList(Field.nullable("items", new
ArrowType.Int(32, true)))),
+ new Field("sparseunion", FieldType.nullable(new
ArrowType.Union(UnionMode.Sparse, new int[] {0})),
+ Collections.singletonList(Field.nullable("items", new
ArrowType.Int(32, true))))
+ );
+
+ for (final Field field : fields) {
+ final Schema schema = new Schema(Collections.singletonList(field));
+ try (final VectorSchemaRoot root = VectorSchemaRoot.create(schema,
allocator)) {
+ Path outputPath = Files.createTempFile("arrow-", ".json");
+ File outputFile = outputPath.toFile();
+ outputFile.deleteOnExit();
+
+ // Try with no allocation
+ try (final JsonFileWriter jsonWriter = new JsonFileWriter(outputFile,
JsonFileWriter.config().pretty(true))) {
+ jsonWriter.start(schema, null);
+ jsonWriter.write(root);
+ } catch (Exception e) {
+ throw new RuntimeException("Test failed for empty vector of type " +
field, e);
+ }
- int[] intValues = new int[]{Integer.MIN_VALUE, Integer.MAX_VALUE, -1};
- uInt4Vector.allocateNew(3);
- uInt4Vector.setValueCount(3);
- for (int i = 0; i < intValues.length; i++) {
- uInt4Vector.set(i, intValues[i]);
- int actualValue = (int)
UInt4Vector.getNoOverflow(uInt4Vector.getDataBuffer(), i);
- assertEquals(intValues[i], actualValue);
- }
+ try (JsonFileReader reader = new JsonFileReader(outputFile,
allocator)) {
+ final Schema readSchema = reader.start();
+ assertEquals(schema, readSchema);
+ try (final VectorSchemaRoot data = reader.read()) {
+ assertNotNull(data);
+ assertEquals(data.getRowCount(), 0);
+ }
+ assertNull(reader.read());
+ }
- byte[] byteValues = new byte[]{Byte.MIN_VALUE, Byte.MAX_VALUE, -1};
- uInt1Vector.allocateNew(3);
- uInt1Vector.setValueCount(3);
- for (int i = 0; i < byteValues.length; i++) {
- uInt1Vector.set(i, byteValues[i]);
- byte actualValue = (byte)
UInt1Vector.getNoOverflow(uInt1Vector.getDataBuffer(), i);
- assertEquals(byteValues[i], actualValue);
+ // Try with an explicit allocation
+ root.allocateNew();
+ root.setRowCount(0);
+ try (final JsonFileWriter jsonWriter = new JsonFileWriter(outputFile,
JsonFileWriter.config().pretty(true))) {
+ jsonWriter.start(schema, null);
+ jsonWriter.write(root);
+ } catch (Exception e) {
+ throw new RuntimeException("Test failed for empty vector of type " +
field, e);
+ }
+
+ try (JsonFileReader reader = new JsonFileReader(outputFile,
allocator)) {
+ final Schema readSchema = reader.start();
+ assertEquals(schema, readSchema);
+ try (final VectorSchemaRoot data = reader.read()) {
+ assertNotNull(data);
+ assertEquals(data.getRowCount(), 0);
+ }
+ assertNull(reader.read());
+ }
}
}
}