zeroshade commented on code in PR #891:
URL: https://github.com/apache/iceberg-go/pull/891#discussion_r3082375446
##########
table/internal/parquet_files.go:
##########
@@ -726,6 +762,65 @@ func (w wrapPqArrowReader) GetRecords(ctx context.Context,
cols []int, tester an
return w.GetRecordReader(ctx, cols, rgList)
}
+// buildFieldIDToColIdx maps each Iceberg field ID to its 0-based column index
in
+// the Parquet file schema. Used to translate bloom filter predicates (which
carry
+// field IDs) into the column positions required by BloomFilterReader.
+func buildFieldIDToColIdx(meta *metadata.FileMetaData) map[int]int {
+ sc := meta.Schema
+ result := make(map[int]int, sc.NumColumns())
+ for i := 0; i < sc.NumColumns(); i++ {
+ fieldID := int(sc.Column(i).SchemaNode().FieldID())
+ result[fieldID] = i
+ }
+
+ return result
+}
+
+// checkRowGroupBloomFilters checks each bloom predicate against the bloom
filter
+// for its column in row group rg. Returns false (skip) if ANY predicate has
none
+// of its values present in the bloom filter. Returns true (keep) on any error
or
+// missing bloom filter data — bloom filters are an optimisation, never a
+// correctness gate.
+func checkRowGroupBloomFilters(
+ bfReader *metadata.BloomFilterReader,
+ rg int,
+ fieldIDToColIdx map[int]int,
+ preds []RowGroupBloomPred,
+) (bool, error) {
+ rgBFReader, err := bfReader.RowGroup(rg)
+ if err != nil || rgBFReader == nil {
+ return true, nil
+ }
Review Comment:
why are we swallowing the error instead of returning it here?
##########
table/evaluators.go:
##########
@@ -1563,3 +1565,212 @@ func (m *strictMetricsEval) canContainNans(fieldID int)
bool {
return exists && cnt > 0
}
+
+// literalToPhysBytes converts an Iceberg literal to the physical byte
+// representation that the Parquet bloom filter writer hashes. The encoding
+// must match Arrow's internal getBytes[T] (little-endian for numerics, raw
+// bytes for byte-array types).
+func literalToPhysBytes(typ iceberg.PrimitiveType, lit iceberg.Literal)
([]byte, bool) {
Review Comment:
could most of these get replaced using the existing MarshalBinary on the
`iceberg.Literal` type?
##########
table/internal/parquet_files.go:
##########
@@ -673,6 +673,21 @@ type ParquetFileSource struct {
file iceberg.DataFile
}
+// RowGroupBloomPred holds the physical-encoded bytes for each literal in a
+// bloom-filterable predicate on one field. A row group can be skipped when
+// NONE of the bytes appear in the column's bloom filter.
+type RowGroupBloomPred struct {
+ FieldID int
+ PhysBytes [][]byte // one entry for EqualTo; one per value for In
+}
+
+// ParquetRowGroupTester combines stats-based and bloom filter row group
pruning.
+// Pass it as the tester argument to wrapPqArrowReader.GetRecords.
+type ParquetRowGroupTester struct {
Review Comment:
does this need to be exported?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]