This is an automated email from the ASF dual-hosted git repository.

zeroshade pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 61692b69e4 GH-35871: [Go] Account for struct validity bitmap in 
`array.ApproxEqual` (#35872)
61692b69e4 is described below

commit 61692b69e4954f50ced110bf46dd92041748776f
Author: Alex Shcherbakov <[email protected]>
AuthorDate: Thu Jun 1 22:30:30 2023 +0300

    GH-35871: [Go] Account for struct validity bitmap in `array.ApproxEqual` 
(#35872)
    
    ### Rationale for this change
    
    When comparing `array.Struct` values with `array.ApproxEqual` the validity 
bitmap of the struct itself should take precedence:
    > When reading the struct array the parent validity bitmap takes priority.
    
    This follows a brief discussion from #35851.
    
    ### What changes are included in this PR?
    
    `array.arrayApproxEqualStruct` will check the fields data only if the 
struct elem is valid.
    
    ### Are these changes tested?
    
    `pqarrow` tests are updated accordingly (no special treatment for structs, 
just `array.ApproxEqual`
    
    ### Are there any user-facing changes?
    
    `array.ApproxEqual` behavior changed to match the docs about validity 
bitmap precedence.
    
    * Closes: #35871
    
    Authored-by: candiduslynx <[email protected]>
    Signed-off-by: Matt Topol <[email protected]>
---
 go/arrow/array/compare.go               | 20 +++++++---
 go/arrow/array/struct.go                |  2 +-
 go/parquet/pqarrow/column_readers.go    | 54 +++++++++++----------------
 go/parquet/pqarrow/encode_arrow_test.go | 66 ++++++++++++++++++++-------------
 4 files changed, 79 insertions(+), 63 deletions(-)

diff --git a/go/arrow/array/compare.go b/go/arrow/array/compare.go
index 48b0df9c84..230c985929 100644
--- a/go/arrow/array/compare.go
+++ b/go/arrow/array/compare.go
@@ -22,6 +22,7 @@ import (
 
        "github.com/apache/arrow/go/v13/arrow"
        "github.com/apache/arrow/go/v13/arrow/float16"
+       "github.com/apache/arrow/go/v13/internal/bitutils"
 )
 
 // RecordEqual reports whether the two provided records are equal.
@@ -735,13 +736,22 @@ func arrayApproxEqualFixedSizeList(left, right 
*FixedSizeList, opt equalOption)
 }
 
 func arrayApproxEqualStruct(left, right *Struct, opt equalOption) bool {
-       for i, lf := range left.fields {
-               rf := right.fields[i]
-               if !arrayApproxEqual(lf, rf, opt) {
-                       return false
+       return bitutils.VisitSetBitRuns(
+               left.NullBitmapBytes(),
+               int64(left.Offset()), int64(left.Len()),
+               approxEqualStructRun(left, right, opt),
+       ) == nil
+}
+
+func approxEqualStructRun(left, right *Struct, opt equalOption) 
bitutils.VisitFn {
+       return func(pos int64, length int64) error {
+               for i := range left.fields {
+                       if !sliceApproxEqual(left.fields[i], pos, pos+length, 
right.fields[i], pos, pos+length, opt) {
+                               return arrow.ErrInvalid
+                       }
                }
+               return nil
        }
-       return true
 }
 
 // arrayApproxEqualMap doesn't care about the order of keys (in Go map 
traversal order is undefined)
diff --git a/go/arrow/array/struct.go b/go/arrow/array/struct.go
index cb4fb5ec5e..9ebc794006 100644
--- a/go/arrow/array/struct.go
+++ b/go/arrow/array/struct.go
@@ -129,7 +129,7 @@ func (a *Struct) 
newStructFieldWithParentValidityMask(fieldIndex int) arrow.Arra
        maskedNullBitmapBytes := make([]byte, len(nullBitmapBytes))
        copy(maskedNullBitmapBytes, nullBitmapBytes)
        for i := 0; i < field.Len(); i++ {
-               if !a.IsValid(i) {
+               if a.IsNull(i) {
                        bitutil.ClearBit(maskedNullBitmapBytes, i)
                }
        }
diff --git a/go/parquet/pqarrow/column_readers.go 
b/go/parquet/pqarrow/column_readers.go
index df02b2c949..767131f1a8 100644
--- a/go/parquet/pqarrow/column_readers.go
+++ b/go/parquet/pqarrow/column_readers.go
@@ -168,16 +168,6 @@ func (sr *structReader) Release() {
 }
 
 func newStructReader(rctx *readerCtx, filtered *arrow.Field, levelInfo 
file.LevelInfo, children []*ColumnReader, props ArrowReadProperties) 
*ColumnReader {
-       // there could be a mix of children some might be repeated and some 
might not be
-       // if possible use one that isn't since that will be guaranteed to have 
the least
-       // number of levels to reconstruct a nullable bitmap
-       var result *ColumnReader
-       for _, child := range children {
-               if !child.IsOrHasRepeatedChild() {
-                       result = child
-               }
-       }
-
        ret := &structReader{
                rctx:      rctx,
                filtered:  filtered,
@@ -186,10 +176,18 @@ func newStructReader(rctx *readerCtx, filtered 
*arrow.Field, levelInfo file.Leve
                props:     props,
                refCount:  1,
        }
-       if result != nil {
-               ret.defRepLevelChild = result
-               ret.hasRepeatedChild = false
-       } else {
+
+       // there could be a mix of children some might be repeated and some 
might not be
+       // if possible use one that isn't since that will be guaranteed to have 
the least
+       // number of levels to reconstruct a nullable bitmap
+       for _, child := range children {
+               if !child.IsOrHasRepeatedChild() {
+                       ret.defRepLevelChild = child
+                       break
+               }
+       }
+
+       if ret.defRepLevelChild == nil {
                ret.defRepLevelChild = children[0]
                ret.hasRepeatedChild = true
        }
@@ -250,15 +248,16 @@ func (sr *structReader) BuildArray(lenBound int64) 
(*arrow.Chunked, error) {
 
        var nullBitmap *memory.Buffer
 
-       if lenBound > 0 {
+       if lenBound > 0 && (sr.hasRepeatedChild || sr.filtered.Nullable) {
+               nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
+               nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
+               validityIO.ValidBits = nullBitmap.Bytes()
+               defLevels, err := sr.GetDefLevels()
+               if err != nil {
+                       return nil, err
+               }
+
                if sr.hasRepeatedChild {
-                       nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
-                       nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
-                       validityIO.ValidBits = nullBitmap.Bytes()
-                       defLevels, err := sr.GetDefLevels()
-                       if err != nil {
-                               return nil, err
-                       }
                        repLevels, err := sr.GetRepLevels()
                        if err != nil {
                                return nil, err
@@ -267,16 +266,7 @@ func (sr *structReader) BuildArray(lenBound int64) 
(*arrow.Chunked, error) {
                        if err := file.DefRepLevelsToBitmap(defLevels, 
repLevels, sr.levelInfo, &validityIO); err != nil {
                                return nil, err
                        }
-
-               } else if sr.filtered.Nullable {
-                       nullBitmap = memory.NewResizableBuffer(sr.rctx.mem)
-                       nullBitmap.Resize(int(bitutil.BytesForBits(lenBound)))
-                       validityIO.ValidBits = nullBitmap.Bytes()
-                       defLevels, err := sr.GetDefLevels()
-                       if err != nil {
-                               return nil, err
-                       }
-
+               } else {
                        file.DefLevelsToBitmap(defLevels, sr.levelInfo, 
&validityIO)
                }
        }
diff --git a/go/parquet/pqarrow/encode_arrow_test.go 
b/go/parquet/pqarrow/encode_arrow_test.go
index 877f584f20..31f8a2b5ab 100644
--- a/go/parquet/pqarrow/encode_arrow_test.go
+++ b/go/parquet/pqarrow/encode_arrow_test.go
@@ -19,7 +19,6 @@ package pqarrow_test
 import (
        "bytes"
        "context"
-       "errors"
        "fmt"
        "math"
        "strconv"
@@ -32,7 +31,6 @@ import (
        "github.com/apache/arrow/go/v13/arrow/decimal128"
        "github.com/apache/arrow/go/v13/arrow/ipc"
        "github.com/apache/arrow/go/v13/arrow/memory"
-       "github.com/apache/arrow/go/v13/internal/bitutils"
        "github.com/apache/arrow/go/v13/internal/types"
        "github.com/apache/arrow/go/v13/internal/utils"
        "github.com/apache/arrow/go/v13/parquet"
@@ -931,6 +929,44 @@ func (ps *ParquetIOTestSuite) TestReadDecimals() {
        ps.True(array.Equal(expected, chunked.Chunk(0)))
 }
 
+func (ps *ParquetIOTestSuite) TestReadNestedStruct() {
+       mem := memory.NewCheckedAllocator(memory.DefaultAllocator)
+       defer mem.AssertSize(ps.T(), 0)
+
+       dt := arrow.StructOf(arrow.Field{
+               Name: "nested",
+               Type: arrow.StructOf(
+                       arrow.Field{Name: "bool", Type: 
arrow.FixedWidthTypes.Boolean},
+                       arrow.Field{Name: "int32", Type: 
arrow.PrimitiveTypes.Int32},
+                       arrow.Field{Name: "int64", Type: 
arrow.PrimitiveTypes.Int64},
+               ),
+       })
+       field := arrow.Field{Name: "struct", Type: dt, Nullable: true}
+
+       builder := array.NewStructBuilder(mem, dt)
+       defer builder.Release()
+       nested := builder.FieldBuilder(0).(*array.StructBuilder)
+
+       builder.Append(true)
+       nested.Append(true)
+       nested.FieldBuilder(0).(*array.BooleanBuilder).Append(true)
+       nested.FieldBuilder(1).(*array.Int32Builder).Append(int32(-1))
+       nested.FieldBuilder(2).(*array.Int64Builder).Append(int64(-2))
+       builder.AppendNull()
+
+       arr := builder.NewStructArray()
+       defer arr.Release()
+
+       expected := array.NewTable(
+               arrow.NewSchema([]arrow.Field{field}, nil),
+               []arrow.Column{*arrow.NewColumn(field, arrow.NewChunked(dt, 
[]arrow.Array{arr}))},
+               -1,
+       )
+       defer arr.Release() // NewChunked
+       defer expected.Release()
+       ps.roundTripTable(mem, expected, true)
+}
+
 func (ps *ParquetIOTestSuite) writeColumn(mem memory.Allocator, sc 
*schema.GroupNode, values arrow.Array) []byte {
        var buf bytes.Buffer
        arrsc, err := pqarrow.FromParquet(schema.NewSchema(sc), nil, nil)
@@ -1032,29 +1068,9 @@ func (ps *ParquetIOTestSuite) roundTripTable(_ 
memory.Allocator, expected arrow.
        tblChunk := tbl.Column(0).Data()
 
        ps.Equal(len(exChunk.Chunks()), len(tblChunk.Chunks()))
-       if exChunk.DataType().ID() != arrow.STRUCT {
-               exc := exChunk.Chunk(0)
-               tbc := tblChunk.Chunk(0)
-               ps.Truef(array.Equal(exc, tbc), "expected: %T %s\ngot: %T %s", 
exc, exc, tbc, tbc)
-       } else {
-               // current impl of ArrayEquals for structs doesn't correctly 
handle nulls in the parent
-               // with a non-nullable child when comparing. Since after the 
round trip, the data in the
-               // child will have the nulls, not the original data.
-               ex := exChunk.Chunk(0)
-               tb := tblChunk.Chunk(0)
-               ps.Equal(ex.NullN(), tb.NullN())
-               if ex.NullN() > 0 {
-                       
ps.Equal(ex.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(ex.Len())))], 
tb.NullBitmapBytes()[:int(bitutil.BytesForBits(int64(tb.Len())))])
-               }
-               ps.Equal(ex.Len(), tb.Len())
-               // only compare the non-null values
-               ps.NoErrorf(bitutils.VisitSetBitRuns(ex.NullBitmapBytes(), 
int64(ex.Data().Offset()), int64(ex.Len()), func(pos, length int64) error {
-                       if !ps.True(array.SliceEqual(ex, pos, pos+length, tb, 
pos, pos+length)) {
-                               return errors.New("failed")
-                       }
-                       return nil
-               }), "expected: %s\ngot: %s", ex, tb)
-       }
+       exc := exChunk.Chunk(0)
+       tbc := tblChunk.Chunk(0)
+       ps.Truef(array.ApproxEqual(exc, tbc), "expected: %T %s\ngot: %T %s", 
exc, exc, tbc, tbc)
 }
 
 func makeEmptyListsArray(size int) arrow.Array {

Reply via email to