projjal commented on a change in pull request #9060: URL: https://github.com/apache/arrow/pull/9060#discussion_r603902501
########## File path: cpp/src/gandiva/array_ops.cc ########## @@ -0,0 +1,76 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/array_ops.h" + +#include "arrow/util/value_parsing.h" +#include "gandiva/engine.h" +#include "gandiva/exported_funcs.h" + +/// Stub functions that can be accessed from LLVM or the pre-compiled library. + +extern "C" { + +bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, Review comment: its better to move these functions to the precompiled directory. ########## File path: cpp/src/gandiva/projector.cc ########## @@ -256,6 +255,35 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, // Create and return array arrays. output->clear(); for (auto& array_data : output_data_vecs) { + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + /* when allocate array data, child data length is an initialized value, + * after calculating, child data offsets buffer has been resized for results, + * but array data length is unchanged. + * We should recalculate child data length and make ArrayData with new length + * + * Otherwise, child data offsets buffer length is data length + 1 + * and offset data is int32_t, need use buffer->size()/4 - 1 + */ + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } Review comment: return error in else saying other types are not supported yet ########## File path: cpp/src/gandiva/projector.cc ########## @@ -328,7 +392,7 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, min_bitmap_len, " actual size ", bitmap_len)); auto type_id = field.type()->id(); - if (arrow::is_binary_like(type_id)) { + if (arrow::is_binary_like(type_id) || type_id == arrow::Type::LIST) { Review comment: good idea to also validate the capacity of child array buffers of list array. ########## File path: cpp/src/gandiva/field_descriptor.h ########## @@ -58,12 +63,15 @@ class FieldDescriptor { bool HasDataBufferPtrIdx() const { return data_buffer_ptr_idx_ != kInvalidIdx; } + bool HasChildOffsetsIdx() const { return child_offsets_idx_ != kInvalidIdx; } + private: FieldPtr field_; int data_idx_; int validity_idx_; int offsets_idx_; int data_buffer_ptr_idx_; + int child_offsets_idx_; Review comment: You should also add the child validity index. I think you should create a new field descriptor for the child data and store it in the list field descriptor, along with validity and offset idx. In this way you can support multilevel nested lists later. ########## File path: cpp/src/gandiva/projector.cc ########## @@ -301,7 +348,24 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); - *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + if (type->id() == arrow::Type::LIST) { + auto internal_type = type->field(0)->type(); + ArrayDataPtr child_data; + if (arrow::is_primitive(internal_type->id())) { + child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, + {nullptr, std::move(buffers[2])}, 0); Review comment: why are you setting child validity buffer as null ########## File path: cpp/src/gandiva/llvm_generator.cc ########## @@ -187,6 +186,14 @@ llvm::Value* LLVMGenerator::GetOffsetsReference(llvm::Value* arg_addrs, int idx, return ir_builder()->CreateIntToPtr(load, types()->i32_ptr_type(), name + "_oarray"); } +/// Get reference to child offsets array at specified index in the args list. +llvm::Value* LLVMGenerator::GetChildOffsetsReference(llvm::Value* arg_addrs, int idx, Review comment: You can just use GetOffsetsReference, they should be doing the same thing -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: [email protected]
