This is an automated email from the ASF dual-hosted git repository.

lidavidm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new c2eda76eb8 ARROW-17107: [Java] Fix variable-width vectors in 
integration JSON writer (#13676)
c2eda76eb8 is described below

commit c2eda76eb84f5bb8c7272b4c2b0cd6be05c06517
Author: David Li <[email protected]>
AuthorDate: Mon Jul 25 06:58:46 2022 -0400

    ARROW-17107: [Java] Fix variable-width vectors in integration JSON writer 
(#13676)
    
    - Clarify that this is meant to be an internal format
    - Fix handling of empty variable-width vectors
    
    Authored-by: David Li <[email protected]>
    Signed-off-by: David Li <[email protected]>
---
 .../apache/arrow/vector/ipc/JsonFileReader.java    |   6 +-
 .../apache/arrow/vector/ipc/JsonFileWriter.java    |  41 ++++++--
 .../org/apache/arrow/vector/TestValueVector.java   |  35 +++++++
 .../org/apache/arrow/vector/ipc/TestJSONFile.java  | 110 +++++++++++++++------
 4 files changed, 152 insertions(+), 40 deletions(-)

diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java 
b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
index 6455857c27..742daeef25 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileReader.java
@@ -784,8 +784,10 @@ public class JsonFileReader implements AutoCloseable, 
DictionaryProvider {
   @Override
   public void close() throws IOException {
     parser.close();
-    for (Dictionary dictionary : dictionaries.values()) {
-      dictionary.getVector().close();
+    if (dictionaries != null) {
+      for (Dictionary dictionary : dictionaries.values()) {
+        dictionary.getVector().close();
+      }
     }
   }
 
diff --git 
a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java 
b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
index 58760c1a94..f5e267e812 100644
--- a/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
+++ b/java/vector/src/main/java/org/apache/arrow/vector/ipc/JsonFileWriter.java
@@ -30,6 +30,7 @@ import java.util.Set;
 
 import org.apache.arrow.memory.ArrowBuf;
 import org.apache.arrow.util.Preconditions;
+import org.apache.arrow.vector.BaseLargeVariableWidthVector;
 import org.apache.arrow.vector.BaseVariableWidthVector;
 import org.apache.arrow.vector.BigIntVector;
 import org.apache.arrow.vector.BitVectorHelper;
@@ -83,8 +84,10 @@ import 
com.fasterxml.jackson.core.util.DefaultPrettyPrinter.NopIndenter;
 import com.fasterxml.jackson.databind.MappingJsonFactory;
 
 /**
- * A writer that converts binary Vectors into a JSON format suitable
+ * A writer that converts binary Vectors into an <em>internal, unstable</em> 
JSON format suitable
  * for integration testing.
+ *
+ * This writer does NOT implement a JSON dataset format like JSONL.
  */
 public class JsonFileWriter implements AutoCloseable {
 
@@ -228,11 +231,21 @@ public class JsonFileWriter implements AutoCloseable {
                   vector.getMinorType() == MinorType.VARBINARY)) {
             writeValueToGenerator(bufferType, vectorBuffer, 
vectorBuffers.get(v - 1), vector, i);
           } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0 
&&
-              (vector.getMinorType() == MinorType.VARBINARY || 
vector.getMinorType() == MinorType.VARCHAR)) {
-            ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(4);
-            vectorBufferTmp.setInt(0, 0);
-            writeValueToGenerator(bufferType, vectorBufferTmp, null, vector, 
i);
-            vectorBufferTmp.close();
+              (vector.getMinorType() == MinorType.LIST || 
vector.getMinorType() == MinorType.MAP ||
+                  vector.getMinorType() == MinorType.VARBINARY || 
vector.getMinorType() == MinorType.VARCHAR)) {
+            // Empty vectors may not have allocated an offsets buffer
+            try (ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(4)) {
+              vectorBufferTmp.setInt(0, 0);
+              writeValueToGenerator(bufferType, vectorBufferTmp, null, vector, 
i);
+            }
+          } else if (bufferType.equals(OFFSET) && vector.getValueCount() == 0 
&&
+              (vector.getMinorType() == MinorType.LARGELIST || 
vector.getMinorType() == MinorType.LARGEVARBINARY ||
+                vector.getMinorType() == MinorType.LARGEVARCHAR)) {
+            // Empty vectors may not have allocated an offsets buffer
+            try (ArrowBuf vectorBufferTmp = vector.getAllocator().buffer(8)) {
+              vectorBufferTmp.setLong(0, 0);
+              writeValueToGenerator(bufferType, vectorBufferTmp, null, vector, 
i);
+            }
           } else {
             writeValueToGenerator(bufferType, vectorBuffer, null, vector, i);
           }
@@ -267,7 +280,21 @@ public class JsonFileWriter implements AutoCloseable {
     if (bufferType.equals(TYPE)) {
       generator.writeNumber(buffer.getByte(index * TinyIntVector.TYPE_WIDTH));
     } else if (bufferType.equals(OFFSET)) {
-      generator.writeNumber(buffer.getInt(index * 
BaseVariableWidthVector.OFFSET_WIDTH));
+      switch (vector.getMinorType()) {
+        case VARCHAR:
+        case VARBINARY:
+        case LIST:
+        case MAP:
+          generator.writeNumber(buffer.getInt((long) index * 
BaseVariableWidthVector.OFFSET_WIDTH));
+          break;
+        case LARGELIST:
+        case LARGEVARBINARY:
+        case LARGEVARCHAR:
+          generator.writeNumber(buffer.getLong((long) index * 
BaseLargeVariableWidthVector.OFFSET_WIDTH));
+          break;
+        default:
+          throw new IllegalArgumentException("Type has no offset buffer: " + 
vector.getField());
+      }
     } else if (bufferType.equals(VALIDITY)) {
       generator.writeNumber(vector.isNull(index) ? 0 : 1);
     } else if (bufferType.equals(DATA)) {
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java 
b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
index 8858281063..516daa2362 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java
@@ -190,6 +190,41 @@ public class TestValueVector {
     }
   }
 
+  @Test
+  public void testNoOverFlowWithUINT() {
+    try (final UInt8Vector uInt8Vector = new UInt8Vector("uint8", allocator);
+         final UInt4Vector uInt4Vector = new UInt4Vector("uint4", allocator);
+         final UInt1Vector uInt1Vector = new UInt1Vector("uint1", allocator)) {
+
+      long[] longValues = new long[]{Long.MIN_VALUE, Long.MAX_VALUE, -1L};
+      uInt8Vector.allocateNew(3);
+      uInt8Vector.setValueCount(3);
+      for (int i = 0; i < longValues.length; i++) {
+        uInt8Vector.set(i, longValues[i]);
+        long readValue = uInt8Vector.getObjectNoOverflow(i).longValue();
+        assertEquals(readValue, longValues[i]);
+      }
+
+      int[] intValues = new int[]{Integer.MIN_VALUE, Integer.MAX_VALUE, -1};
+      uInt4Vector.allocateNew(3);
+      uInt4Vector.setValueCount(3);
+      for (int i = 0; i < intValues.length; i++) {
+        uInt4Vector.set(i, intValues[i]);
+        int actualValue = (int) 
UInt4Vector.getNoOverflow(uInt4Vector.getDataBuffer(), i);
+        assertEquals(intValues[i], actualValue);
+      }
+
+      byte[] byteValues = new byte[]{Byte.MIN_VALUE, Byte.MAX_VALUE, -1};
+      uInt1Vector.allocateNew(3);
+      uInt1Vector.setValueCount(3);
+      for (int i = 0; i < byteValues.length; i++) {
+        uInt1Vector.set(i, byteValues[i]);
+        byte actualValue = (byte) 
UInt1Vector.getNoOverflow(uInt1Vector.getDataBuffer(), i);
+        assertEquals(byteValues[i], actualValue);
+      }
+    }
+  }
+
   @Test /* IntVector */
   public void testFixedType2() {
     try (final IntVector intVector = new IntVector(EMPTY_SCHEMA_PATH, 
allocator)) {
diff --git 
a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java 
b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
index f0aa226e26..0aa49d9daa 100644
--- a/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
+++ b/java/vector/src/test/java/org/apache/arrow/vector/ipc/TestJSONFile.java
@@ -18,21 +18,29 @@
 package org.apache.arrow.vector.ipc;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
 
 import org.apache.arrow.memory.BufferAllocator;
 import org.apache.arrow.vector.FieldVector;
-import org.apache.arrow.vector.UInt1Vector;
-import org.apache.arrow.vector.UInt4Vector;
-import org.apache.arrow.vector.UInt8Vector;
 import org.apache.arrow.vector.VectorSchemaRoot;
 import org.apache.arrow.vector.complex.StructVector;
 import org.apache.arrow.vector.complex.impl.ComplexWriterImpl;
 import org.apache.arrow.vector.complex.writer.BaseWriter;
 import org.apache.arrow.vector.dictionary.DictionaryProvider;
 import 
org.apache.arrow.vector.dictionary.DictionaryProvider.MapDictionaryProvider;
+import org.apache.arrow.vector.types.UnionMode;
+import org.apache.arrow.vector.types.pojo.ArrowType;
+import org.apache.arrow.vector.types.pojo.Field;
+import org.apache.arrow.vector.types.pojo.FieldType;
 import org.apache.arrow.vector.types.pojo.Schema;
 import org.apache.arrow.vector.util.Validator;
 import org.junit.Assert;
@@ -421,37 +429,77 @@ public class TestJSONFile extends BaseFileTest {
     }
   }
 
+  /** Regression test for ARROW-17107. */
   @Test
-  public void testNoOverFlowWithUINT() {
-    try (final UInt8Vector uInt8Vector = new UInt8Vector("uint8", allocator);
-        final UInt4Vector uInt4Vector = new UInt4Vector("uint4", allocator);
-        final UInt1Vector uInt1Vector = new UInt1Vector("uint1", allocator)) {
-
-      long[] longValues = new long[]{Long.MIN_VALUE, Long.MAX_VALUE, -1L};
-      uInt8Vector.allocateNew(3);
-      uInt8Vector.setValueCount(3);
-      for (int i = 0; i < longValues.length; i++) {
-        uInt8Vector.set(i, longValues[i]);
-        long readValue = uInt8Vector.getObjectNoOverflow(i).longValue();
-        assertEquals(readValue, longValues[i]);
-      }
+  public void testRoundtripEmptyVector() throws Exception {
+    final List<Field> fields = Arrays.asList(
+        Field.nullable("utf8", ArrowType.Utf8.INSTANCE),
+        Field.nullable("largeutf8", ArrowType.LargeUtf8.INSTANCE),
+        Field.nullable("binary", ArrowType.Binary.INSTANCE),
+        Field.nullable("largebinary", ArrowType.LargeBinary.INSTANCE),
+        Field.nullable("fixedsizebinary", new ArrowType.FixedSizeBinary(2)),
+        Field.nullable("decimal128", new ArrowType.Decimal(3, 2, 128)),
+        Field.nullable("decimal128", new ArrowType.Decimal(3, 2, 256)),
+        new Field("list", FieldType.nullable(ArrowType.List.INSTANCE),
+            Collections.singletonList(Field.nullable("items", new 
ArrowType.Int(32, true)))),
+        new Field("largelist", 
FieldType.nullable(ArrowType.LargeList.INSTANCE),
+            Collections.singletonList(Field.nullable("items", new 
ArrowType.Int(32, true)))),
+        new Field("map", FieldType.nullable(new ArrowType.Map(/*keyssorted*/ 
false)),
+            Collections.singletonList(new Field("items", 
FieldType.notNullable(ArrowType.Struct.INSTANCE),
+                Arrays.asList(Field.notNullable("keys", new ArrowType.Int(32, 
true)),
+                    Field.nullable("values", new ArrowType.Int(32, true)))))),
+        new Field("fixedsizelist", FieldType.nullable(new 
ArrowType.FixedSizeList(2)),
+            Collections.singletonList(Field.nullable("items", new 
ArrowType.Int(32, true)))),
+        new Field("denseunion", FieldType.nullable(new 
ArrowType.Union(UnionMode.Dense, new int[] {0})),
+            Collections.singletonList(Field.nullable("items", new 
ArrowType.Int(32, true)))),
+        new Field("sparseunion", FieldType.nullable(new 
ArrowType.Union(UnionMode.Sparse, new int[] {0})),
+            Collections.singletonList(Field.nullable("items", new 
ArrowType.Int(32, true))))
+    );
+
+    for (final Field field : fields) {
+      final Schema schema = new Schema(Collections.singletonList(field));
+      try (final VectorSchemaRoot root = VectorSchemaRoot.create(schema, 
allocator)) {
+        Path outputPath = Files.createTempFile("arrow-", ".json");
+        File outputFile = outputPath.toFile();
+        outputFile.deleteOnExit();
+
+        // Try with no allocation
+        try (final JsonFileWriter jsonWriter = new JsonFileWriter(outputFile, 
JsonFileWriter.config().pretty(true))) {
+          jsonWriter.start(schema, null);
+          jsonWriter.write(root);
+        } catch (Exception e) {
+          throw new RuntimeException("Test failed for empty vector of type " + 
field, e);
+        }
 
-      int[] intValues = new int[]{Integer.MIN_VALUE, Integer.MAX_VALUE, -1};
-      uInt4Vector.allocateNew(3);
-      uInt4Vector.setValueCount(3);
-      for (int i = 0; i < intValues.length; i++) {
-        uInt4Vector.set(i, intValues[i]);
-        int actualValue = (int) 
UInt4Vector.getNoOverflow(uInt4Vector.getDataBuffer(), i);
-        assertEquals(intValues[i], actualValue);
-      }
+        try (JsonFileReader reader = new JsonFileReader(outputFile, 
allocator)) {
+          final Schema readSchema = reader.start();
+          assertEquals(schema, readSchema);
+          try (final VectorSchemaRoot data = reader.read()) {
+            assertNotNull(data);
+            assertEquals(data.getRowCount(), 0);
+          }
+          assertNull(reader.read());
+        }
 
-      byte[] byteValues = new byte[]{Byte.MIN_VALUE, Byte.MAX_VALUE, -1};
-      uInt1Vector.allocateNew(3);
-      uInt1Vector.setValueCount(3);
-      for (int i = 0; i < byteValues.length; i++) {
-        uInt1Vector.set(i, byteValues[i]);
-        byte actualValue = (byte) 
UInt1Vector.getNoOverflow(uInt1Vector.getDataBuffer(), i);
-        assertEquals(byteValues[i], actualValue);
+        // Try with an explicit allocation
+        root.allocateNew();
+        root.setRowCount(0);
+        try (final JsonFileWriter jsonWriter = new JsonFileWriter(outputFile, 
JsonFileWriter.config().pretty(true))) {
+          jsonWriter.start(schema, null);
+          jsonWriter.write(root);
+        } catch (Exception e) {
+          throw new RuntimeException("Test failed for empty vector of type " + 
field, e);
+        }
+
+        try (JsonFileReader reader = new JsonFileReader(outputFile, 
allocator)) {
+          final Schema readSchema = reader.start();
+          assertEquals(schema, readSchema);
+          try (final VectorSchemaRoot data = reader.read()) {
+            assertNotNull(data);
+            assertEquals(data.getRowCount(), 0);
+          }
+          assertNull(reader.read());
+        }
       }
     }
   }

Reply via email to