zeroshade commented on code in PR #35654: URL: https://github.com/apache/arrow/pull/35654#discussion_r1221798676
########## go/arrow/compute/exprs/builders.go: ########## @@ -0,0 +1,445 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//go:build go1.18 + +package exprs + +import ( + "fmt" + "strconv" + "strings" + "unicode" + + "github.com/apache/arrow/go/v13/arrow" + "github.com/apache/arrow/go/v13/arrow/compute" + "github.com/substrait-io/substrait-go/expr" + "github.com/substrait-io/substrait-go/extensions" + "github.com/substrait-io/substrait-go/types" +) + +// NewDefaultExtensionSet constructs an empty extension set using the default +// Arrow Extension registry and the default collection of substrait extensions +// from the Substrait-go repo. +func NewDefaultExtensionSet() ExtensionIDSet { + return NewExtensionSetDefault(expr.NewEmptyExtensionRegistry(&extensions.DefaultCollection)) +} + +// NewScalarCall constructs a substrait ScalarFunction expression with the provided +// options and arguments. +// +// The function name (fn) is looked up in the internal Arrow DefaultExtensionIDRegistry +// to ensure it exists and to convert from the Arrow function name to the substrait +// function name. It is then looked up using the DefaultCollection from the +// substrait extensions module to find the declaration. 
If it cannot be found, +// we try constructing the compound signature name by getting the types of the +// arguments which were passed and appending them to the function name appropriately. +// +// An error is returned if the function cannot be resolved. +func NewScalarCall(reg ExtensionIDSet, fn string, opts []*types.FunctionOption, args ...types.FuncArg) (*expr.ScalarFunction, error) { + conv, ok := reg.GetArrowRegistry().GetArrowToSubstrait(fn) + if !ok { + return nil, arrow.ErrNotFound + } + + id, convOpts, err := conv(fn) + if err != nil { + return nil, err + } + + opts = append(opts, convOpts...) + return expr.NewScalarFunc(reg.GetSubstraitRegistry(), id, opts, args...) +} + +// NewFieldRefFromDotPath constructs a substrait reference segment from +// a dot path and the base schema. +// +// dot_path = '.' name +// +// | '[' digit+ ']' +// | dot_path+ +// +// # Examples +// +// Assume root schema of {alpha: i32, beta: struct<gamma: list<i32>>, delta: map<string, i32>} +// +// ".alpha" => StructFieldRef(0) +// "[2]" => StructFieldRef(2) +// ".beta[0]" => StructFieldRef(1, StructFieldRef(0)) +// "[1].gamma[3]" => StructFieldRef(1, StructFieldRef(0, ListElementRef(3))) +// ".delta.foobar" => StructFieldRef(2, MapKeyRef("foobar")) +// +// Note: when parsing a name, a '\' preceding any other character +// will be dropped from the resulting name. Therefore if a name must +// contain the characters '.', '\', '[', or ']' then they must be escaped +// with a preceding '\'. 
+func NewFieldRefFromDotPath(dotpath string, rootSchema *arrow.Schema) (expr.ReferenceSegment, error) { + if len(dotpath) == 0 { + return nil, fmt.Errorf("%w dotpath was empty", arrow.ErrInvalid) + } + + parseName := func() string { + var name string + for { + idx := strings.IndexAny(dotpath, `\[.`) + if idx == -1 { + name += dotpath + dotpath = "" + break + } + + if dotpath[idx] != '\\' { + // subscript for a new field ref + name += dotpath[:idx] + dotpath = dotpath[idx:] + break + } + + if len(dotpath) == idx+1 { + // dotpath ends with a backslash; consume it all + name += dotpath + dotpath = "" + break + } + + // append all characters before backslash, then the character which follows it + name += dotpath[:idx] + string(dotpath[idx+1]) + dotpath = dotpath[idx+2:] + } + return name + } + + var curType arrow.DataType = arrow.StructOf(rootSchema.Fields()...) + children := make([]expr.ReferenceSegment, 0) + + for len(dotpath) > 0 { + subscript := dotpath[0] + dotpath = dotpath[1:] + switch subscript { + case '.': + // next element is a name + n := parseName() + switch ct := curType.(type) { + case *arrow.StructType: + idx, found := ct.FieldIdx(n) Review Comment: `FieldIdx` is backed by a map. The `arrow.StructType` in Go currently doesn't support multiple fields with the same name, as it maintains a `map[string]int` to map field names to indices. `FieldIdx` just does a lookup in the map. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
