mapleFU commented on code in PR #44822:
URL: https://github.com/apache/arrow/pull/44822#discussion_r1855369115


##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const 
ExecSpan& batch, ExecResult* ou
     }
   }
 
-  const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
-      input.GetValues<BinaryViewType::c_type>(1), input.length);
-
-  // TODO(GH-43573): A more efficient implementation that copies the validity
-  // bitmap all at once is possible, but would mean we don't delegate all the
-  // building logic to the ArrayBuilder implementation for the output type.
-  OutputBuilderType builder(options.to_type.GetSharedPtr(), 
ctx->memory_pool());
-  RETURN_NOT_OK(builder.Resize(input.length));
-  RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
-  arrow::internal::ArraySpanInlineVisitor<I> visitor;
-  RETURN_NOT_OK(visitor.VisitStatus(
-      input,
-      [&](std::string_view v) {
-        // Append valid string view
-        return builder.Append(v);
+  ArrayData* output = out->array_data().get();
+  output->length = input.length;
+  output->SetNullCount(input.null_count);
+
+  // Set up bitmap
+  if (input.offset == output->offset) {
+    output->buffers[0] = input.GetBuffer(0);
+  } else {
+    if (input.buffers[0].data != NULLPTR) {

Review Comment:
   ```
       // When the offsets are different (e.g., due to slice operation), we 
need to check if
       // the null bitmap buffer is not null before copying it. The null bitmap 
buffer can be
       // null if the input array value does not contain any null value.
   ```
   
   Do we also need a comment here?



##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const 
ExecSpan& batch, ExecResult* ou
     }
   }
 
-  const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
-      input.GetValues<BinaryViewType::c_type>(1), input.length);
-
-  // TODO(GH-43573): A more efficient implementation that copies the validity
-  // bitmap all at once is possible, but would mean we don't delegate all the
-  // building logic to the ArrayBuilder implementation for the output type.
-  OutputBuilderType builder(options.to_type.GetSharedPtr(), 
ctx->memory_pool());
-  RETURN_NOT_OK(builder.Resize(input.length));
-  RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
-  arrow::internal::ArraySpanInlineVisitor<I> visitor;
-  RETURN_NOT_OK(visitor.VisitStatus(
-      input,
-      [&](std::string_view v) {
-        // Append valid string view
-        return builder.Append(v);
+  ArrayData* output = out->array_data().get();
+  output->length = input.length;
+  output->SetNullCount(input.null_count);
+
+  // Set up bitmap
+  if (input.offset == output->offset) {
+    output->buffers[0] = input.GetBuffer(0);
+  } else {
+    if (input.buffers[0].data != NULLPTR) {
+      ARROW_ASSIGN_OR_RAISE(
+          output->buffers[0],
+          arrow::internal::CopyBitmap(ctx->memory_pool(), 
input.buffers[0].data,
+                                      input.offset, input.length));
+    }
+  }
+
+  // Set up offset and data buffer
+  DataBuilder data_builder(ctx->memory_pool());

Review Comment:
   The previous implementation calculated `sum_of_binary_view_sizes` and called 
`ReserveData` with it. Why doesn't the new code reserve the data buffer in the same 
way? Would blindly appending to the buffer (with a capacity check on every append) 
be faster?



##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const 
ExecSpan& batch, ExecResult* ou
     }
   }
 
-  const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
-      input.GetValues<BinaryViewType::c_type>(1), input.length);
-
-  // TODO(GH-43573): A more efficient implementation that copies the validity
-  // bitmap all at once is possible, but would mean we don't delegate all the
-  // building logic to the ArrayBuilder implementation for the output type.
-  OutputBuilderType builder(options.to_type.GetSharedPtr(), 
ctx->memory_pool());
-  RETURN_NOT_OK(builder.Resize(input.length));
-  RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
-  arrow::internal::ArraySpanInlineVisitor<I> visitor;
-  RETURN_NOT_OK(visitor.VisitStatus(
-      input,
-      [&](std::string_view v) {
-        // Append valid string view
-        return builder.Append(v);
+  ArrayData* output = out->array_data().get();
+  output->length = input.length;
+  output->SetNullCount(input.null_count);
+
+  // Set up bitmap

Review Comment:
   ```suggestion
     // Set up validity bitmap
   ```



##########
cpp/src/arrow/compute/kernels/scalar_cast_string.cc:
##########
@@ -327,31 +329,43 @@ BinaryToBinaryCastExec(KernelContext* ctx, const 
ExecSpan& batch, ExecResult* ou
     }
   }
 
-  const int64_t sum_of_binary_view_sizes = util::SumOfBinaryViewSizes(
-      input.GetValues<BinaryViewType::c_type>(1), input.length);
-
-  // TODO(GH-43573): A more efficient implementation that copies the validity
-  // bitmap all at once is possible, but would mean we don't delegate all the
-  // building logic to the ArrayBuilder implementation for the output type.
-  OutputBuilderType builder(options.to_type.GetSharedPtr(), 
ctx->memory_pool());
-  RETURN_NOT_OK(builder.Resize(input.length));
-  RETURN_NOT_OK(builder.ReserveData(sum_of_binary_view_sizes));
-  arrow::internal::ArraySpanInlineVisitor<I> visitor;
-  RETURN_NOT_OK(visitor.VisitStatus(
-      input,
-      [&](std::string_view v) {
-        // Append valid string view
-        return builder.Append(v);
+  ArrayData* output = out->array_data().get();
+  output->length = input.length;
+  output->SetNullCount(input.null_count);
+
+  // Set up bitmap
+  if (input.offset == output->offset) {
+    output->buffers[0] = input.GetBuffer(0);
+  } else {
+    if (input.buffers[0].data != NULLPTR) {
+      ARROW_ASSIGN_OR_RAISE(
+          output->buffers[0],
+          arrow::internal::CopyBitmap(ctx->memory_pool(), 
input.buffers[0].data,
+                                      input.offset, input.length));
+    }
+  }
+
+  // Set up offset and data buffer
+  DataBuilder data_builder(ctx->memory_pool());
+  OffsetBuilder offset_builder(ctx->memory_pool());
+  RETURN_NOT_OK(offset_builder.Reserve(batch.length + 1));
+  offset_builder.UnsafeAppend(0);  // offsets start at 0
+  RETURN_NOT_OK(VisitArraySpanInline<I>(
+      batch[0].array,
+      [&](std::string_view s) {
+        // for non-null value, append string view to buffer and calculate 
offset
+        ARROW_RETURN_NOT_OK(data_builder.Append(
+            reinterpret_cast<const uint8_t*>(s.data()), 
static_cast<int64_t>(s.size())));
+        
offset_builder.UnsafeAppend(static_cast<offset_type>(data_builder.length()));
+        return Status::OK();
       },
       [&]() {
-        // Append null
-        builder.UnsafeAppendNull();
+        // for null value, no need to update buffer

Review Comment:
   ```suggestion
           // for null value, no need to update data buffer
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to