aocsa commented on a change in pull request #10802:
URL: https://github.com/apache/arrow/pull/10802#discussion_r690412001



##########
File path: cpp/src/arrow/compute/kernels/vector_selection.cc
##########
@@ -2146,6 +2147,219 @@ class TakeMetaFunction : public MetaFunction {
   }
 };
 
+// ----------------------------------------------------------------------
+// DropNull Implementation
+
+Status GetDropNullFilter(const Array& values, MemoryPool* memory_pool,
+                         std::shared_ptr<arrow::BooleanArray>* out_array) {
+  auto bitmap_buffer = values.null_bitmap();
+  *out_array = std::make_shared<BooleanArray>(values.length(), bitmap_buffer, 
nullptr, 0,
+                                              values.offset());
+  return Status::OK();
+}
+
+Status CreateEmptyArray(std::shared_ptr<DataType> type, MemoryPool* 
memory_pool,
+                        std::shared_ptr<Array>* output_array) {
+  std::unique_ptr<ArrayBuilder> builder;
+  RETURN_NOT_OK(MakeBuilder(memory_pool, type, &builder));
+  RETURN_NOT_OK(builder->Resize(0));
+  ARROW_ASSIGN_OR_RAISE(*output_array, builder->Finish());
+  return Status::OK();
+}
+
+Status CreateEmptyChunkedArray(std::shared_ptr<DataType> type, MemoryPool* 
memory_pool,
+                               std::shared_ptr<ChunkedArray>* output_array) {
+  std::vector<std::shared_ptr<Array>> new_chunks(1);  // Hard-coded 1 for now
+  ARROW_RETURN_NOT_OK(CreateEmptyArray(type, memory_pool, &new_chunks[0]));
+  *output_array = std::make_shared<ChunkedArray>(std::move(new_chunks));
+  return Status::OK();
+}
+
+Result<std::shared_ptr<Array>> DropNullArray(const std::shared_ptr<Array>& 
values,
+                                             ExecContext* ctx) {
+  if (values->null_count() == 0) {
+    return values;
+  }
+  if (values->type()->Equals(arrow::null())) {
+    return std::make_shared<NullArray>(0);
+  }
+  std::shared_ptr<BooleanArray> drop_null_filter;
+  RETURN_NOT_OK(GetDropNullFilter(*values, ctx->memory_pool(), 
&drop_null_filter));
+
+  if (drop_null_filter->null_count() == drop_null_filter->length()) {
+    std::shared_ptr<Array> empty_array;
+    RETURN_NOT_OK(CreateEmptyArray(values->type(), ctx->memory_pool(), 
&empty_array));
+    return empty_array;
+  }
+  auto options = FilterOptions::Defaults();
+  ARROW_ASSIGN_OR_RAISE(
+      Datum result,
+      CallFunction("array_filter", {Datum(*values), Datum(*drop_null_filter)}, 
&options,
+                   ctx));
+  return result.make_array();
+}
+
+Result<std::shared_ptr<ChunkedArray>> DropNullChunkedArray(const ChunkedArray& 
values,
+                                                           ExecContext* ctx) {
+  if (values.null_count() == values.length()) {
+    std::shared_ptr<ChunkedArray> empty_array;
+    RETURN_NOT_OK(
+        CreateEmptyChunkedArray(values.type(), ctx->memory_pool(), 
&empty_array));
+    return empty_array;
+  }
+  std::vector<std::shared_ptr<Array>> new_chunks;
+  for (const auto& chunk : values.chunks()) {
+    ARROW_ASSIGN_OR_RAISE(auto new_chunk, DropNullArray(chunk, ctx));
+    if (new_chunk->length() > 0) {
+      new_chunks.push_back(new_chunk);
+    }
+  }
+  return std::make_shared<ChunkedArray>(std::move(new_chunks));
+}
+
+Result<std::shared_ptr<RecordBatch>> DropNullRecordBatch(const RecordBatch& 
batch,
+                                                         ExecContext* ctx) {
+  int64_t null_count = 0;
+  for (const auto& column : batch.columns()) {
+    null_count += column->null_count();
+  }
+  if (null_count == 0) {
+    return RecordBatch::Make(batch.schema(), batch.num_rows(), 
batch.columns());
+  }
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(batch.num_rows(), 
ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), true);
+  for (const auto& column : batch.columns()) {
+    if (column->type()->Equals(arrow::null())) {
+      BitUtil::SetBitsTo(dst->mutable_data(), 0, batch.num_rows(), false);
+      break;
+    }
+    if (column->null_bitmap_data()) {
+      ::arrow::internal::BitmapAnd(column->null_bitmap_data(), 
column->offset(),
+                                   dst->data(), 0, column->length(), 0,
+                                   dst->mutable_data());
+    }
+  }
+  auto drop_null_filter =
+      std::make_shared<BooleanArray>(batch.num_rows(), dst, nullptr, 0, 0);
+  if (drop_null_filter->null_count() == batch.num_rows()) {
+    std::vector<std::shared_ptr<Array>> empty_batch(batch.num_columns());
+    for (int i = 0; i < batch.num_columns(); i++) {
+      RETURN_NOT_OK(
+          CreateEmptyArray(batch.column(i)->type(), ctx->memory_pool(), 
&empty_batch[i]));
+    }
+    return RecordBatch::Make(batch.schema(), 0, empty_batch);
+  }
+  ARROW_ASSIGN_OR_RAISE(Datum result, Filter(Datum(batch), 
Datum(drop_null_filter),
+                                             FilterOptions::Defaults(), ctx));
+  return result.record_batch();
+}
+
+Result<std::shared_ptr<Table>> DropNullTable(const Table& table, ExecContext* 
ctx) {
+  if (table.num_rows() == 0) {
+    return Table::Make(table.schema(), table.columns(), 0);
+  }
+  int64_t null_count = 0;
+  for (const auto& col : table.columns()) {
+    for (const auto& column_chunk : col->chunks()) {
+      null_count += column_chunk->null_count();
+    }
+  }
+  if (null_count == 0) {
+    return Table::Make(table.schema(), table.columns(), table.num_rows());
+  }
+
+  ARROW_ASSIGN_OR_RAISE(auto dst,
+                        AllocateEmptyBitmap(table.num_rows(), 
ctx->memory_pool()));
+  BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), true);
+
+  for (const auto& col : table.columns()) {
+    if (col->type()->Equals(arrow::null())) {
+      BitUtil::SetBitsTo(dst->mutable_data(), 0, table.num_rows(), false);
+      break;
+    }
+    std::vector<::arrow::internal::Bitmap> bitmaps;
+    std::transform(col->chunks().begin(), col->chunks().end(),
+                   std::back_inserter(bitmaps), [](const 
std::shared_ptr<Array>& array) {
+                     return 
::arrow::internal::Bitmap(array->null_bitmap_data(),
+                                                      array->offset(), 
array->length());
+                   });
+    int64_t global_offset = 0;
+    ARROW_ASSIGN_OR_RAISE(auto concatenated_bitmap,
+                          AllocateEmptyBitmap(table.num_rows(), 
ctx->memory_pool()));
+    BitUtil::SetBitsTo(concatenated_bitmap->mutable_data(), 0, 
table.num_rows(), true);

Review comment:
       I followed your last suggestion of using `TableBatchReader`. The reason I 
needed to concatenate all source bitmap chunks was that `BitmapAnd` only has an 
unaligned implementation with respect to the offset (I don't know why), not the 
bitmap length. So if `chunk->length() % 8 != 0`, `BitmapAnd` will fail. 
https://github.com/apache/arrow/blob/820e5061847c9d6d261c416e57d6013321175565/cpp/src/arrow/util/bitmap_ops.cc#L285
   
   Referenced before
   https://github.com/apache/arrow/pull/10802#discussion_r684341804




-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to