This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 2895af4922 GH-37845: [Go][Parquet] Check the number of logical fields 
instead of physical columns (#37846)
2895af4922 is described below

commit 2895af492236e5dc42ac665406de8648a9aac6db
Author: Tim Schaub <[email protected]>
AuthorDate: Tue Sep 26 09:01:36 2023 -0600

    GH-37845: [Go][Parquet] Check the number of logical fields instead of 
physical columns (#37846)
    
    ### Rationale for this change
    
    This makes it so trying to read with a column chunk reader consistently 
returns an error if the index is outside the bounds of the logical fields 
(currently it panics in some cases and returns an error in others).
    
    ### What changes are included in this PR?
    
    This makes it so the column chunk reader checks the number of logical 
fields instead of the number of physical columns when checking if an index is 
out of range.
    
    ### Are these changes tested?
    
    The new test will panics without the accompanying code change.
    
    ### Are there any user-facing changes?
    
    Applications that used to panic will now have an error to handle instead.
    
    * Closes: #37845
    
    Authored-by: Tim Schaub <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 go/parquet/pqarrow/file_reader.go      |  4 +-
 go/parquet/pqarrow/file_reader_test.go | 67 ++++++++++++++++++++++++++++++++++
 2 files changed, 69 insertions(+), 2 deletions(-)

diff --git a/go/parquet/pqarrow/file_reader.go 
b/go/parquet/pqarrow/file_reader.go
index d54e365b55..d91010c62c 100755
--- a/go/parquet/pqarrow/file_reader.go
+++ b/go/parquet/pqarrow/file_reader.go
@@ -394,8 +394,8 @@ func (fr *FileReader) ReadRowGroups(ctx context.Context, 
indices, rowGroups []in
 }
 
 func (fr *FileReader) getColumnReader(ctx context.Context, i int, colFactory 
itrFactory) (*ColumnReader, error) {
-       if i < 0 || i >= fr.rdr.MetaData().Schema.NumColumns() {
-               return nil, fmt.Errorf("invalid column index chosen %d, there 
are only %d columns", i, fr.rdr.MetaData().Schema.NumColumns())
+       if i < 0 || i >= len(fr.Manifest.Fields) {
+               return nil, fmt.Errorf("invalid column index chosen %d, there 
are only %d columns", i, len(fr.Manifest.Fields))
        }
 
        ctx = context.WithValue(ctx, rdrCtxKey{}, readerCtx{
diff --git a/go/parquet/pqarrow/file_reader_test.go 
b/go/parquet/pqarrow/file_reader_test.go
index 2b4aa8ab78..d1f3ae1c98 100644
--- a/go/parquet/pqarrow/file_reader_test.go
+++ b/go/parquet/pqarrow/file_reader_test.go
@@ -19,9 +19,11 @@ package pqarrow_test
 import (
        "bytes"
        "context"
+       "fmt"
        "io"
        "os"
        "path/filepath"
+       "strings"
        "testing"
 
        "github.com/apache/arrow/go/v14/arrow"
@@ -216,3 +218,68 @@ func TestFileReaderWriterMetadata(t *testing.T) {
        assert.Equal(t, []string{"foo", "bar"}, kvMeta.Keys())
        assert.Equal(t, []string{"bar", "baz"}, kvMeta.Values())
 }
+
+func TestFileReaderColumnChunkBoundsErrors(t *testing.T) {
+       schema := arrow.NewSchema([]arrow.Field{
+               {Name: "zero", Type: arrow.PrimitiveTypes.Float64},
+               {Name: "g", Type: arrow.StructOf(
+                       arrow.Field{Name: "one", Type: 
arrow.PrimitiveTypes.Float64},
+                       arrow.Field{Name: "two", Type: 
arrow.PrimitiveTypes.Float64},
+                       arrow.Field{Name: "three", Type: 
arrow.PrimitiveTypes.Float64},
+               )},
+       }, nil)
+
+       // generate Parquet data with four columns
+       // that are represented by two logical fields
+       data := `[
+               {
+                       "zero": 1,
+                       "g": {
+                               "one": 1,
+                               "two": 1,
+                               "three": 1
+                       }
+               },
+               {
+                       "zero": 2,
+                       "g": {
+                               "one": 2,
+                               "two": 2,
+                               "three": 2
+                       }
+               }
+       ]`
+
+       record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, 
strings.NewReader(data))
+       require.NoError(t, err)
+
+       output := &bytes.Buffer{}
+       writer, err := pqarrow.NewFileWriter(schema, output, 
parquet.NewWriterProperties(), pqarrow.DefaultWriterProps())
+       require.NoError(t, err)
+
+       require.NoError(t, writer.Write(record))
+       require.NoError(t, writer.Close())
+
+       fileReader, err := 
file.NewParquetReader(bytes.NewReader(output.Bytes()))
+       require.NoError(t, err)
+
+       arrowReader, err := pqarrow.NewFileReader(fileReader, 
pqarrow.ArrowReadProperties{BatchSize: 1024}, memory.DefaultAllocator)
+       require.NoError(t, err)
+
+       // assert that errors are returned for indexes outside the bounds of 
the logical fields (instead of the physical columns)
+       ctx := pqarrow.NewArrowWriteContext(context.Background(), nil)
+       assert.Greater(t, fileReader.NumRowGroups(), 0)
+       for rowGroupIndex := 0; rowGroupIndex < fileReader.NumRowGroups(); 
rowGroupIndex += 1 {
+               rowGroupReader := arrowReader.RowGroup(rowGroupIndex)
+               for fieldNum := 0; fieldNum < schema.NumFields(); fieldNum += 1 
{
+                       _, err := rowGroupReader.Column(fieldNum).Read(ctx)
+                       assert.NoError(t, err, "reading field num: %d", 
fieldNum)
+               }
+
+               _, subZeroErr := rowGroupReader.Column(-1).Read(ctx)
+               assert.Error(t, subZeroErr)
+
+               _, tooHighErr := 
rowGroupReader.Column(schema.NumFields()).Read(ctx)
+               assert.ErrorContains(t, tooHighErr, fmt.Sprintf("there are only 
%d columns", schema.NumFields()))
+       }
+}

Reply via email to