[ 
https://issues.apache.org/jira/browse/ARROW-15971?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Arrow User updated ARROW-15971:
-------------------------------
    Description: 
h2. Description

When using C++ (or Python) to construct a null or empty outer array of type 
`array_1: list<item: struct<array_sub_col: list<item: string>>>`, either:
 - `array_1:null` or
 - `array_1:[]`

an out-of-bounds exception (see stack trace below) is thrown when later 
retrieving the field reader for the inner list (`array_sub_col`) in Java.
h2. Reproduction

Java: 7.0.0
C++: 4.0.0
Python: 7.0.0

Creating a stream in C++ of type `array_1: list<item: struct<array_sub_col: 
list<item: string>>>` with an empty (or null) outer list:
{code:c++}arrow::MemoryPool* pool = arrow::default_memory_pool();
arrow::Result<std::shared_ptr<arrow::io::BufferOutputStream>> stream_buffer =
    arrow::io::BufferOutputStream::Create(1, pool);

std::vector<std::shared_ptr<arrow::Field>> 
inner_list_field\{std::make_shared<arrow::Field>("array_sub_col",arrow::list(arrow::utf8()))};

// Datatype for the builder: list<struct<list<string>>>
std::shared_ptr<DataType> data_type = list(struct_(inner_list_field));

std::unique_ptr<arrow::ArrayBuilder> builder;
arrow::MakeBuilder(pool, data_type, &builder);
auto* list_builder = dynamic_cast<arrow::ListBuilder*>(builder.get());

// Append a null or an empty list to the outer list
list_builder->AppendNull(); // or list_builder->AppendEmptyValue()

std::vector<std::shared_ptr<arrow::Array>> value_batch;
value_batch.resize(1);
list_builder->Finish(&value_batch[0]);

std::vector<std::shared_ptr<arrow::Field>> 
outer_list_field\{std::make_shared<arrow::Field>("array_1",data_type)};
auto schema = std::make_shared<arrow::Schema>(outer_list_field);

// Build a single row record batch
std::shared_ptr<arrow::RecordBatch> batch = RecordBatch::Make(schema, 1, 
value_batch);
ASSERT_OK(batch->Validate());

// Stream the batch to a file
arrow::Result<std::shared_ptr<ipc::RecordBatchWriter>> stream_writer = 
    arrow::ipc::MakeStreamWriter(stream_buffer.ValueOrDie().get(), schema, 
arrow::ipc::IpcWriteOptions::Defaults());
stream_writer.ValueOrDie()->WriteRecordBatch(*batch);

arrow::Result<std::shared_ptr<arrow::Buffer>> buffer_result = 
stream_buffer.ValueOrDie()->Finish();
std::shared_ptr<arrow::Buffer> buffer = buffer_result.ValueOrDie();
auto file_output = 
arrow::io::FileOutputStream::Open("/tmp/batch_stream.out").ValueOrDie();
file_output->Write(buffer->data(), buffer->size());
file_output->Close();
{code}
As expected, Python holds the same memory layout for the field vectors as the 
code above:
{code:python}array = pa.array([None], 
type=pa.list_(pa.struct([pa.field("array_sub_col", pa.list_(pa.utf8()))])))
batch = pa.record_batch([array], names=["array_1"])

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
buf = sink.getvalue()

with open('/tmp/batch_stream.out', 'wb') as f:
    f.write(buf)
{code}
*Java then fails when trying to access the inner list's field reader:*
{code:java}File file = new File("/tmp/batch_stream.out");
byte[] bytes = FileUtils.readFileToByteArray(file);
try (ArrowStreamReader reader = new ArrowStreamReader(new 
ByteArrayInputStream(bytes), allocator)) {
     Schema schema = reader.getVectorSchemaRoot().getSchema();
     reader.loadNextBatch();
     
readBatch.getVector("array_1").getReader().reader().reader("array_sub_col");    
                 // <- fails: reader("array_sub_col") fails with OOB

     // Concrete readers:
     // FieldVector array_1 = readBatch.getVector("array_1");
     // UnionListReader array_1_reader = (UnionListReader) array_1.getReader();
     // NullableStructReaderImpl struct_reader = (NullableStructReaderImpl) 
array_1_reader.reader();
     // FieldReader union_list_reader = struct_reader.reader("array_sub_col");  
                      // <- fails: OOB
{code}
h3. Stack trace:
{quote}java.lang.reflect.InvocationTargetException
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:62)
    at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke (Method.java:566)
    at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
    at java.lang.Thread.run (Thread.java:829)
Caused by: java.lang.IndexOutOfBoundsException: index: 4, length: 4 (expected: 
range(0, 4))
    at org.apache.arrow.memory.ArrowBuf.checkIndexD (ArrowBuf.java:318)
    at org.apache.arrow.memory.ArrowBuf.chk (ArrowBuf.java:305)
    at org.apache.arrow.memory.ArrowBuf.getInt (ArrowBuf.java:424)
    at com.test.arrow.ValidateArrow.testArrow (ValidateArrow.java:433)
    at com.test.arrow.ValidateArrow.main (ValidateArrow.java:440)
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:62)
    at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke (Method.java:566)
    at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
    at java.lang.Thread.run (Thread.java:829)
{quote}

  was:
h2. Description

When using C++ (or Python) to construct a null or empty outer array of type 
`array_1: list<item: struct<array_sub_col: list<item: string>>>`, either:
 - `array_1:null` or
 - `array_1:[]`

an out of bounds exceptions (see stack trace below) follows when later 
retrieving the field reader for the inner list (`array_sub_col`) in Java.
h2. Reproduction

Java: 7.0.0
C++: 4.0.0
Python: 7.0.0

Creating a stream on C++ of type `array_1: list<item: struct<array_sub_col: 
list<item: string>>> ` with an empty (or null) outer list:
{quote}arrow::MemoryPool* pool = arrow::default_memory_pool();
arrow::Result<std::shared_ptr<arrow::io::BufferOutputStream>> stream_buffer =
    arrow::io::BufferOutputStream::Create(1, pool);

std::vector<std::shared_ptr<arrow::Field>> 
inner_list_field\{std::make_shared<arrow::Field>("array_sub_col",arrow::list(arrow::utf8()))};

// Datatype for the builder: list<struct<list<string>>>
std::shared_ptr<DataType> data_type = list(struct_(inner_list_field));

std::unique_ptr<arrow::ArrayBuilder> builder;
arrow::MakeBuilder(pool, data_type, &builder);
auto* list_builder = dynamic_cast<arrow::ListBuilder*>(builder.get());

// Append a null or an empty list to the outer list
list_builder->AppendNull(); // or list_builder->AppendEmptyValue()

std::vector<std::shared_ptr<arrow::Array>> value_batch;
value_batch.resize(1);
list_builder->Finish(&value_batch[0]);

std::vector<std::shared_ptr<arrow::Field>> 
outer_list_field\{std::make_shared<arrow::Field>("array_1",data_type)};
auto schema = std::make_shared<arrow::Schema>(outer_list_field);

// Build a single row record batch
std::shared_ptr<arrow::RecordBatch> batch = RecordBatch::Make(schema, 1, 
value_batch);
ASSERT_OK(batch->Validate());

// Stream the batch to a file
arrow::Result<std::shared_ptr<ipc::RecordBatchWriter>> stream_writer = 
    arrow::ipc::MakeStreamWriter(stream_buffer.ValueOrDie().get(), schema, 
arrow::ipc::IpcWriteOptions::Defaults());
stream_writer.ValueOrDie()->WriteRecordBatch(*batch);

arrow::Result<std::shared_ptr<arrow::Buffer>> buffer_result = 
stream_buffer.ValueOrDie()->Finish();
std::shared_ptr<arrow::Buffer> buffer = buffer_result.ValueOrDie();
auto file_output = 
arrow::io::FileOutputStream::Open("/tmp/batch_stream.out").ValueOrDie();
file_output->Write(buffer->data(), buffer->size());
file_output->Close();
{quote}
As expected, Python holds the same memory layout for the field vectors as the 
code above:
{quote}array = pa.array([None], 
type=pa.list_(pa.struct([pa.field("array_sub_col", pa.list_(pa.utf8()))])))
batch = pa.record_batch([struct_array], names=["array_1"])

sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, batch.schema) as writer:
    writer.write_batch(batch)
buf = sink.getvalue()

with open('/tmp/batch_stream.out', 'wb') as f:
    f.write(buf)
{quote}
*Java fails when then trying to access the inner list's field reader:*
{quote}File file = new File("/tmp/batch_stream.out");
byte[] bytes = FileUtils.readFileToByteArray(file);
try (ArrowStreamReader reader = new ArrowStreamReader(new 
ByteArrayInputStream(bytes), allocator)) {
     Schema schema = reader.getVectorSchemaRoot().getSchema();
     reader.loadNextBatch();
     
readBatch.getVector("array_1").getReader().reader().reader("array_sub_col");    
                 // <- fails: reader("array_sub_col") fails with OOB

     // Concrete readers:
     // FieldVector array_1 = readBatch.getVector("array_1");
     // UnionListReader array_1_reader = (UnionListReader) array_1.getReader();
     // NullableStructReaderImpl struct_reader = (NullableStructReaderImpl) 
array_1_reader.reader();
     // FieldReader union_list_reader = struct_reader.reader("array_sub_col");  
                      // <- fails: OOB
{quote}
h3. Stack trace:
{quote}java.lang.reflect.InvocationTargetException
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:62)
    at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke (Method.java:566)
    at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
    at java.lang.Thread.run (Thread.java:829)
Caused by: java.lang.IndexOutOfBoundsException: index: 4, length: 4 (expected: 
range(0, 4))
    at org.apache.arrow.memory.ArrowBuf.checkIndexD (ArrowBuf.java:318)
    at org.apache.arrow.memory.ArrowBuf.chk (ArrowBuf.java:305)
    at org.apache.arrow.memory.ArrowBuf.getInt (ArrowBuf.java:424)
    at com.test.arrow.ValidateArrow.testArrow (ValidateArrow.java:433)
    at com.test.arrow.ValidateArrow.main (ValidateArrow.java:440)
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
    at jdk.internal.reflect.NativeMethodAccessorImpl.invoke 
(NativeMethodAccessorImpl.java:62)
    at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke 
(DelegatingMethodAccessorImpl.java:43)
    at java.lang.reflect.Method.invoke (Method.java:566)
    at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
    at java.lang.Thread.run (Thread.java:829)
{quote}


> Error when reading inner lists within a struct in empty outer lists from 
> C++/Python in Java
> -------------------------------------------------------------------------------------------
>
>                 Key: ARROW-15971
>                 URL: https://issues.apache.org/jira/browse/ARROW-15971
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: C++, Java, Python
>    Affects Versions: 7.0.0
>            Reporter: Arrow User
>            Priority: Major
>              Labels: bug
>
> h2. Description
> When using C++ (or Python) to construct a null or empty outer array of type 
> `array_1: list<item: struct<array_sub_col: list<item: string>>>`, either:
>  - `array_1:null` or
>  - `array_1:[]`
> an out-of-bounds exception (see stack trace below) is thrown when later 
> retrieving the field reader for the inner list (`array_sub_col`) in Java.
> h2. Reproduction
> Java: 7.0.0
> C++: 4.0.0
> Python: 7.0.0
> Creating a stream in C++ of type `array_1: list<item: struct<array_sub_col: 
> list<item: string>>>` with an empty (or null) outer list:
> {code:c++}arrow::MemoryPool* pool = arrow::default_memory_pool();
> arrow::Result<std::shared_ptr<arrow::io::BufferOutputStream>> stream_buffer =
>     arrow::io::BufferOutputStream::Create(1, pool);
> std::vector<std::shared_ptr<arrow::Field>> 
> inner_list_field\{std::make_shared<arrow::Field>("array_sub_col",arrow::list(arrow::utf8()))};
> // Datatype for the builder: list<struct<list<string>>>
> std::shared_ptr<DataType> data_type = list(struct_(inner_list_field));
> std::unique_ptr<arrow::ArrayBuilder> builder;
> arrow::MakeBuilder(pool, data_type, &builder);
> auto* list_builder = dynamic_cast<arrow::ListBuilder*>(builder.get());
> // Append a null or an empty list to the outer list
> list_builder->AppendNull(); // or list_builder->AppendEmptyValue()
> std::vector<std::shared_ptr<arrow::Array>> value_batch;
> value_batch.resize(1);
> list_builder->Finish(&value_batch[0]);
> std::vector<std::shared_ptr<arrow::Field>> 
> outer_list_field\{std::make_shared<arrow::Field>("array_1",data_type)};
> auto schema = std::make_shared<arrow::Schema>(outer_list_field);
> // Build a single row record batch
> std::shared_ptr<arrow::RecordBatch> batch = RecordBatch::Make(schema, 1, 
> value_batch);
> ASSERT_OK(batch->Validate());
> // Stream the batch to a file
> arrow::Result<std::shared_ptr<ipc::RecordBatchWriter>> stream_writer = 
>     arrow::ipc::MakeStreamWriter(stream_buffer.ValueOrDie().get(), schema, 
> arrow::ipc::IpcWriteOptions::Defaults());
> stream_writer.ValueOrDie()->WriteRecordBatch(*batch);
> arrow::Result<std::shared_ptr<arrow::Buffer>> buffer_result = 
> stream_buffer.ValueOrDie()->Finish();
> std::shared_ptr<arrow::Buffer> buffer = buffer_result.ValueOrDie();
> auto file_output = 
> arrow::io::FileOutputStream::Open("/tmp/batch_stream.out").ValueOrDie();
> file_output->Write(buffer->data(), buffer->size());
> file_output->Close();
> {code}
> As expected, Python holds the same memory layout for the field vectors as the 
> code above:
> {code:python}array = pa.array([None], 
> type=pa.list_(pa.struct([pa.field("array_sub_col", pa.list_(pa.utf8()))])))
> batch = pa.record_batch([array], names=["array_1"])
> sink = pa.BufferOutputStream()
> with pa.ipc.new_stream(sink, batch.schema) as writer:
>     writer.write_batch(batch)
> buf = sink.getvalue()
> with open('/tmp/batch_stream.out', 'wb') as f:
>     f.write(buf)
> {code}
> *Java then fails when trying to access the inner list's field reader:*
> {code:java}File file = new File("/tmp/batch_stream.out");
> byte[] bytes = FileUtils.readFileToByteArray(file);
> try (ArrowStreamReader reader = new ArrowStreamReader(new 
> ByteArrayInputStream(bytes), allocator)) {
>      Schema schema = reader.getVectorSchemaRoot().getSchema();
>      reader.loadNextBatch();
>      
> readBatch.getVector("array_1").getReader().reader().reader("array_sub_col");  
>                    // <- fails: reader("array_sub_col") fails with OOB
>      // Concrete readers:
>      // FieldVector array_1 = readBatch.getVector("array_1");
>      // UnionListReader array_1_reader = (UnionListReader) 
> array_1.getReader();
>      // NullableStructReaderImpl struct_reader = (NullableStructReaderImpl) 
> array_1_reader.reader();
>      // FieldReader union_list_reader = 
> struct_reader.reader("array_sub_col");                        // <- fails: OOB
> {code}
> h3. Stack trace:
> {quote}java.lang.reflect.InvocationTargetException
>     at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
>     at jdk.internal.reflect.NativeMethodAccessorImpl.invoke 
> (NativeMethodAccessorImpl.java:62)
>     at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke 
> (DelegatingMethodAccessorImpl.java:43)
>     at java.lang.reflect.Method.invoke (Method.java:566)
>     at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
>     at java.lang.Thread.run (Thread.java:829)
> Caused by: java.lang.IndexOutOfBoundsException: index: 4, length: 4 
> (expected: range(0, 4))
>     at org.apache.arrow.memory.ArrowBuf.checkIndexD (ArrowBuf.java:318)
>     at org.apache.arrow.memory.ArrowBuf.chk (ArrowBuf.java:305)
>     at org.apache.arrow.memory.ArrowBuf.getInt (ArrowBuf.java:424)
>     at com.test.arrow.ValidateArrow.testArrow (ValidateArrow.java:433)
>     at com.test.arrow.ValidateArrow.main (ValidateArrow.java:440)
>     at jdk.internal.reflect.NativeMethodAccessorImpl.invoke0 (Native Method)
>     at jdk.internal.reflect.NativeMethodAccessorImpl.invoke 
> (NativeMethodAccessorImpl.java:62)
>     at jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke 
> (DelegatingMethodAccessorImpl.java:43)
>     at java.lang.reflect.Method.invoke (Method.java:566)
>     at org.codehaus.mojo.exec.ExecJavaMojo$1.run (ExecJavaMojo.java:297)
>     at java.lang.Thread.run (Thread.java:829)
> {quote}



--
This message was sent by Atlassian Jira
(v8.20.1#820001)

Reply via email to