felipecrv commented on code in PR #41975:
URL: https://github.com/apache/arrow/pull/41975#discussion_r1637396286
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -297,11 +297,23 @@ void ComputeDataPreallocate(const DataType& type,
case Type::MAP:
widths->emplace_back(32, /*added_length=*/1);
return;
+ case Type::LIST_VIEW: {
+ // add offsets and size
+ widths->emplace_back(32, /*added_length=*/1);
+ widths->emplace_back(32, /*added_length=*/1);
+ return;
+ }
case Type::LARGE_BINARY:
case Type::LARGE_STRING:
case Type::LARGE_LIST:
widths->emplace_back(64, /*added_length=*/1);
return;
+ case Type::LARGE_LIST_VIEW: {
+ // add offsets and size
+ widths->emplace_back(64, /*added_length=*/1);
+ widths->emplace_back(64, /*added_length=*/1);
Review Comment:
list-views don't need an extra offset/size, so you don't need the
added_length=1.
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -473,7 +485,7 @@ bool ExecSpanIterator::Next(ExecSpan* span) {
namespace {
struct NullGeneralization {
- enum type { PERHAPS_NULL, ALL_VALID, ALL_NULL };
+ enum type { PERHAPS_NULL = 1, ALL_VALID = 2, ALL_NULL = 3 };
Review Comment:
Agreed. This should start from 0 to give an easier time to the compiler
(when it's proving boundaries in the optimizer).
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -498,15 +510,36 @@ struct NullGeneralization {
return PERHAPS_NULL;
}
+ static type Get(const ChunkedArray& chunk_array) {
+ if (chunk_array.num_chunks() == 0 ||
+ (chunk_array.null_count() == chunk_array.length())) {
+ return ALL_NULL;
+ }
+
+ type null_gen = ALL_NULL;
+ int chunk_idx = 0;
+ while (chunk_idx < chunk_array.num_chunks()) {
Review Comment:
You can use a `for` loop here.
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -498,15 +510,36 @@ struct NullGeneralization {
return PERHAPS_NULL;
}
+ static type Get(const ChunkedArray& chunk_array) {
+ if (chunk_array.num_chunks() == 0 ||
Review Comment:
if `num_chunks() == 0`, we should return `ALL_VALID` instead of `ALL_NULL`
or `PERHAPS_NULL` like `Get` does above (it doesn't check `.length()` at all
which I think was a missed opportunity of making `ALL_VALID` more likely).
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -498,15 +510,36 @@ struct NullGeneralization {
return PERHAPS_NULL;
}
+ static type Get(const ChunkedArray& chunk_array) {
+ if (chunk_array.num_chunks() == 0 ||
+ (chunk_array.null_count() == chunk_array.length())) {
+ return ALL_NULL;
+ }
+
+ type null_gen = ALL_NULL;
+ int chunk_idx = 0;
+ while (chunk_idx < chunk_array.num_chunks()) {
+ ExecValue value;
+ const ArrayData* curr_chunk = chunk_array.chunk(chunk_idx)->data().get();
+ value.SetArray(*curr_chunk);
+ null_gen = static_cast<type>((null_gen & Get(value)));
Review Comment:
if you get a `PERHAPS_NULL` you can early return
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -498,15 +510,36 @@ struct NullGeneralization {
return PERHAPS_NULL;
}
+ static type Get(const ChunkedArray& chunk_array) {
+ if (chunk_array.num_chunks() == 0 ||
+ (chunk_array.null_count() == chunk_array.length())) {
+ return ALL_NULL;
+ }
+
+ type null_gen = ALL_NULL;
+ int chunk_idx = 0;
+ while (chunk_idx < chunk_array.num_chunks()) {
+ ExecValue value;
+ const ArrayData* curr_chunk = chunk_array.chunk(chunk_idx)->data().get();
+ value.SetArray(*curr_chunk);
+ null_gen = static_cast<type>((null_gen & Get(value)));
Review Comment:
it's better to do the switch on null_gen here than to rely on `&` and clever
number assignment to the enum
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -1022,25 +1048,7 @@ Status CheckCanExecuteChunked(const VectorKernel*
kernel) {
class VectorExecutor : public KernelExecutorImpl<VectorKernel> {
public:
Status Execute(const ExecBatch& batch, ExecListener* listener) override {
- // Some vector kernels have a separate code path for handling
- // chunked arrays (VectorKernel::exec_chunked) so we check if we
- // have any chunked arrays. If we do and an exec_chunked function
- // is defined then we call that.
- bool have_chunked_arrays = false;
- for (const Datum& arg : batch.values) {
- if (arg.is_chunked_array()) have_chunked_arrays = true;
- }
-
- output_num_buffers_ =
static_cast<int>(output_type_.type->layout().buffers.size());
-
- // Decide if we need to preallocate memory for this kernel
- validity_preallocated_ =
Review Comment:
FWIW I'm working on Take at the moment and setting the chunked exec kernels
https://github.com/apache/arrow/pull/41700
##########
cpp/src/arrow/compute/exec.cc:
##########
@@ -1123,6 +1142,34 @@ class VectorExecutor : public
KernelExecutorImpl<VectorKernel> {
}
}
+ Status SetupPreallocation(const std::vector<Datum>& args) {
Review Comment:
Very hard to review this code extraction. Was this code churn necessary to
achieve the goals of this PR?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]