zeroshade commented on code in PR #329:
URL: https://github.com/apache/iceberg-go/pull/329#discussion_r1989770403


##########
table/arrow_utils.go:
##########
@@ -892,3 +899,356 @@ func ToRequestedSchema(ctx context.Context, requested, fileSchema *iceberg.Schem
 
        return out, nil
 }
+
+func must[T any](v T, err error) T {
+       if err != nil {
+               panic(err)
+       }
+
+       return v
+}
+
+type metricModeType string
+
+const (
+       metricModeTruncate metricModeType = "truncate"
+       metricModeNone     metricModeType = "none"
+       metricModeCounts   metricModeType = "counts"
+       metricModeFull     metricModeType = "full"
+)
+
+type metricsMode struct {
+       typ metricModeType
+       len int
+}
+
+var truncationExpr = regexp.MustCompile(`^truncate\((\d+)\)$`)
+
+func matchMetricsMode(mode string) (metricsMode, error) {
+       sanitized := strings.ToLower(strings.TrimSpace(mode))
+       if strings.HasPrefix(sanitized, string(metricModeTruncate)) {
+               m := truncationExpr.FindStringSubmatch(sanitized)
+               if len(m) < 2 {
+                       return metricsMode{}, fmt.Errorf("malformed truncate 
metrics mode: %s", mode)
+               }
+
+               truncLen, err := strconv.Atoi(m[1])
+               if err != nil {
+                       return metricsMode{}, fmt.Errorf("malformed truncate 
metrics mode: %s", mode)
+               }
+
+               if truncLen <= 0 {
+                       return metricsMode{}, fmt.Errorf("invalid truncate 
length: %d", truncLen)
+               }
+
+               return metricsMode{typ: metricModeTruncate, len: truncLen}, nil
+       }
+
+       switch sanitized {
+       case string(metricModeNone):
+               return metricsMode{typ: metricModeNone}, nil
+       case string(metricModeCounts):
+               return metricsMode{typ: metricModeCounts}, nil
+       case string(metricModeFull):
+               return metricsMode{typ: metricModeFull}, nil
+       default:
+               return metricsMode{}, fmt.Errorf("unsupported metrics mode: 
%s", mode)
+       }
+}
+
+type statisticsCollector struct {
+       fieldID    int
+       icebergTyp iceberg.PrimitiveType
+       mode       metricsMode
+       colName    string
+}
+
+type arrowStatsCollector struct {
+       fieldID     int
+       schema      *iceberg.Schema
+       props       iceberg.Properties
+       defaultMode string
+}
+
+func (a *arrowStatsCollector) Schema(_ *iceberg.Schema, results func() 
[]statisticsCollector) []statisticsCollector {
+       return results()
+}
+
+func (a *arrowStatsCollector) Struct(_ iceberg.StructType, results []func() 
[]statisticsCollector) []statisticsCollector {
+       result := make([]statisticsCollector, 0, len(results))
+       for _, res := range results {
+               result = append(result, res()...)
+       }
+
+       return result
+}
+
+func (a *arrowStatsCollector) Field(field iceberg.NestedField, fieldRes func() 
[]statisticsCollector) []statisticsCollector {
+       a.fieldID = field.ID
+
+       return fieldRes()
+}
+
+func (a *arrowStatsCollector) List(list iceberg.ListType, elemResult func() 
[]statisticsCollector) []statisticsCollector {
+       a.fieldID = list.ElementID
+
+       return elemResult()
+}
+
+func (a *arrowStatsCollector) Map(m iceberg.MapType, keyResult func() 
[]statisticsCollector, valResult func() []statisticsCollector) 
[]statisticsCollector {
+       a.fieldID = m.KeyID
+       keyRes := keyResult()
+
+       a.fieldID = m.ValueID
+       valRes := valResult()
+
+       return append(keyRes, valRes...)
+}
+
+func (a *arrowStatsCollector) Primitive(dt iceberg.PrimitiveType) 
[]statisticsCollector {
+       colName, ok := a.schema.FindColumnName(a.fieldID)
+       if !ok {
+               return []statisticsCollector{}
+       }
+
+       metMode, err := matchMetricsMode(a.defaultMode)
+       if err != nil {
+               panic(err)
+       }
+
+       colMode, ok := a.props[MetricsModeColumnConfPrefix+"."+colName]
+       if ok {
+               metMode, err = matchMetricsMode(colMode)
+               if err != nil {
+                       panic(err)
+               }
+       }
+
+       switch dt.(type) {
+       case iceberg.StringType:
+       case iceberg.BinaryType:
+       default:
+               if metMode.typ == metricModeTruncate {
+                       metMode = metricsMode{typ: metricModeFull, len: 0}
+               }
+       }
+
+       isNested := strings.Contains(colName, ".")

Review Comment:
   Columns with `.` in their name would be difficult for many Parquet readers 
to handle, since Parquet uses `.` as the path separator when determining the 
full path to a column/node. 
   
   Also, I got this from pyiceberg :smile: 
https://github.com/apache/iceberg-python/blob/main/pyiceberg/io/pyarrow.py#L2085



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to