pitrou commented on code in PR #39383:
URL: https://github.com/apache/arrow/pull/39383#discussion_r1456057628


##########
cpp/src/arrow/compute/light_array.cc:
##########
@@ -575,18 +576,26 @@ Status ExecBatchBuilder::AppendSelected(const 
std::shared_ptr<ArrayData>& source
 
     // Step 1: calculate target offsets
     //
-    uint32_t* offsets = reinterpret_cast<uint32_t*>(target->mutable_data(1));
-    uint32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
-    Visit(source, num_rows_to_append, row_ids,
-          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
-            offsets[num_rows_before + i] = num_bytes;
-          });
-    for (int i = 0; i < num_rows_to_append; ++i) {
-      uint32_t length = offsets[num_rows_before + i];
-      offsets[num_rows_before + i] = sum;
-      sum += length;
+    int32_t* offsets = reinterpret_cast<int32_t*>(target->mutable_data(1));
+    {
+      int64_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
+      Visit(source, num_rows_to_append, row_ids,
+            [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+              offsets[num_rows_before + i] = num_bytes;

Review Comment:
   Is there a possibility for `num_bytes` to be greater than 
`std::numeric_limits<int32_t>::max()`?
   
   Perhaps we should change `Visit` to pass a `int32_t num_bytes` instead.



##########
cpp/src/arrow/compute/light_array.cc:
##########
@@ -575,18 +576,26 @@ Status ExecBatchBuilder::AppendSelected(const 
std::shared_ptr<ArrayData>& source
 
     // Step 1: calculate target offsets
     //
-    uint32_t* offsets = reinterpret_cast<uint32_t*>(target->mutable_data(1));
-    uint32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
-    Visit(source, num_rows_to_append, row_ids,
-          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
-            offsets[num_rows_before + i] = num_bytes;
-          });
-    for (int i = 0; i < num_rows_to_append; ++i) {
-      uint32_t length = offsets[num_rows_before + i];
-      offsets[num_rows_before + i] = sum;
-      sum += length;
+    int32_t* offsets = reinterpret_cast<int32_t*>(target->mutable_data(1));
+    {
+      int64_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
+      Visit(source, num_rows_to_append, row_ids,
+            [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+              offsets[num_rows_before + i] = num_bytes;
+            });
+      for (int i = 0; i < num_rows_to_append; ++i) {
+        int32_t length = offsets[num_rows_before + i];
+        offsets[num_rows_before + i] = static_cast<int32_t>(sum);
+        if (ARROW_PREDICT_FALSE(sum + length > BinaryBuilder::memory_limit())) 
{

Review Comment:
   This is correct, but an alternative would be to use `AddWithOverflow` 
instead of a 64-bit addition. Both will probably have the same performance, of 
course.



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -20,6 +20,7 @@
 #include <gtest/gtest.h>
 #include <numeric>
 
+#include "arrow/array/builder_binary.h"

Review Comment:
   Similarly, this inclusion is probably not needed?



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -407,6 +408,63 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) {
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 
+TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  constexpr auto eight_mb = 8 * 1024 * 1024;
+  std::string str_8mb(eight_mb, 'a');
+  std::string str_8mb_minus_1(eight_mb - 1, 'b');
+  std::string str_8mb_minus_2(eight_mb - 2,
+                              'b');  // BaseBinaryBuilder::memory_limit()
+  std::shared_ptr<Array> values_8mb = ConstantArrayGenerator::String(1, 
str_8mb);
+  std::shared_ptr<Array> values_8mb_minus_1 =
+      ConstantArrayGenerator::String(1, str_8mb_minus_1);
+  std::shared_ptr<Array> values_8mb_minus_2 =
+      ConstantArrayGenerator::String(1, str_8mb_minus_2);
+
+  ExecBatch batch_8mb({values_8mb}, 1);
+  ExecBatch batch_8mb_minus_1({values_8mb_minus_1}, 1);
+  ExecBatch batch_8mb_minus_2({values_8mb_minus_2}, 1);
+
+  auto num_rows = std::numeric_limits<int32_t>::max() / eight_mb;
+  StringBuilder values_ref_builder;
+  ASSERT_OK(values_ref_builder.Reserve(num_rows + 1));

Review Comment:
   You could also call `ReserveData` to avoid cascading resizes to grow the 
data buffer.



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -407,6 +408,63 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) {
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 
+TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  constexpr auto eight_mb = 8 * 1024 * 1024;
+  std::string str_8mb(eight_mb, 'a');
+  std::string str_8mb_minus_1(eight_mb - 1, 'b');
+  std::string str_8mb_minus_2(eight_mb - 2,
+                              'b');  // BaseBinaryBuilder::memory_limit()
+  std::shared_ptr<Array> values_8mb = ConstantArrayGenerator::String(1, 
str_8mb);
+  std::shared_ptr<Array> values_8mb_minus_1 =
+      ConstantArrayGenerator::String(1, str_8mb_minus_1);
+  std::shared_ptr<Array> values_8mb_minus_2 =
+      ConstantArrayGenerator::String(1, str_8mb_minus_2);
+
+  ExecBatch batch_8mb({values_8mb}, 1);
+  ExecBatch batch_8mb_minus_1({values_8mb_minus_1}, 1);
+  ExecBatch batch_8mb_minus_2({values_8mb_minus_2}, 1);
+
+  auto num_rows = std::numeric_limits<int32_t>::max() / eight_mb;
+  StringBuilder values_ref_builder;
+  ASSERT_OK(values_ref_builder.Reserve(num_rows + 1));
+  for (int i = 0; i < num_rows; i++) {
+    ASSERT_OK(values_ref_builder.Append(str_8mb));
+  }
+  ASSERT_OK(values_ref_builder.Append(str_8mb_minus_2));
+  ASSERT_OK_AND_ASSIGN(auto values_ref, values_ref_builder.Finish());
+  auto values_ref_1 = values_ref->Slice(0, num_rows);
+  ExecBatch batch_ref_1({values_ref_1}, num_rows);
+  ExecBatch batch_ref_2({values_ref}, num_rows + 1);
+
+  std::vector<uint16_t> first_set_row_ids(num_rows, 0);
+  std::vector<uint16_t> second_set_row_ids(1, 0);
+
+  {
+    ExecBatchBuilder builder;
+    ASSERT_OK(builder.AppendSelected(pool, batch_8mb, num_rows, 
first_set_row_ids.data(),
+                                     /*num_cols=*/1));
+    ASSERT_RAISES(Invalid, builder.AppendSelected(pool, batch_8mb_minus_1, 1,

Review Comment:
   You could perhaps use `ASSERT_RAISES_WITH_MESSAGE` to make sure that the 
error message is as expected.



##########
cpp/src/arrow/compute/light_array.cc:
##########
@@ -19,7 +19,9 @@
 
 #include <type_traits>
 
+#include "arrow/array/builder_binary.h"

Review Comment:
   It's a bit overkill to include this just for the binary size limit. You can 
instead use `std::numeric_limits<int32_t>::max()`.



##########
cpp/src/arrow/compute/light_array.cc:
##########
@@ -325,11 +327,11 @@ Status ResizableArrayData::ResizeVaryingLengthBuffer() {
   column_metadata = ColumnMetadataFromDataType(data_type_).ValueOrDie();
 
   if (!column_metadata.is_fixed_length) {
-    int min_new_size = static_cast<int>(reinterpret_cast<const uint32_t*>(
-        buffers_[kFixedLengthBuffer]->data())[num_rows_]);
+    int64_t min_new_size =
+        reinterpret_cast<const 
int32_t*>(buffers_[kFixedLengthBuffer]->data())[num_rows_];

Review Comment:
   This is correct but I think it can also be written as:
   ```suggestion
       int64_t min_new_size = 
buffers_[kFixedLengthBuffer]->data_as<int32_t>()[num_rows_];
   ```



##########
cpp/src/arrow/compute/light_array.cc:
##########
@@ -575,18 +576,26 @@ Status ExecBatchBuilder::AppendSelected(const 
std::shared_ptr<ArrayData>& source
 
     // Step 1: calculate target offsets
     //
-    uint32_t* offsets = reinterpret_cast<uint32_t*>(target->mutable_data(1));
-    uint32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
-    Visit(source, num_rows_to_append, row_ids,
-          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
-            offsets[num_rows_before + i] = num_bytes;
-          });
-    for (int i = 0; i < num_rows_to_append; ++i) {
-      uint32_t length = offsets[num_rows_before + i];
-      offsets[num_rows_before + i] = sum;
-      sum += length;
+    int32_t* offsets = reinterpret_cast<int32_t*>(target->mutable_data(1));
+    {
+      int64_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
+      Visit(source, num_rows_to_append, row_ids,
+            [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
+              offsets[num_rows_before + i] = num_bytes;
+            });
+      for (int i = 0; i < num_rows_to_append; ++i) {
+        int32_t length = offsets[num_rows_before + i];

Review Comment:
   For the record, I find it amusing that we're looping over the offsets a 
second time here. Perhaps `Visit` for variable-length types isn't useful?



##########
cpp/src/arrow/compute/light_array.cc:
##########
@@ -575,18 +576,26 @@ Status ExecBatchBuilder::AppendSelected(const 
std::shared_ptr<ArrayData>& source
 
     // Step 1: calculate target offsets
     //
-    uint32_t* offsets = reinterpret_cast<uint32_t*>(target->mutable_data(1));
-    uint32_t sum = num_rows_before == 0 ? 0 : offsets[num_rows_before];
-    Visit(source, num_rows_to_append, row_ids,
-          [&](int i, const uint8_t* ptr, uint32_t num_bytes) {
-            offsets[num_rows_before + i] = num_bytes;
-          });
-    for (int i = 0; i < num_rows_to_append; ++i) {
-      uint32_t length = offsets[num_rows_before + i];
-      offsets[num_rows_before + i] = sum;
-      sum += length;
+    int32_t* offsets = reinterpret_cast<int32_t*>(target->mutable_data(1));
+    {

Review Comment:
   Not sure why you're opening a new block here, is it necessary for some 
reason?



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -407,6 +408,63 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) {
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 
+TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  constexpr auto eight_mb = 8 * 1024 * 1024;
+  std::string str_8mb(eight_mb, 'a');
+  std::string str_8mb_minus_1(eight_mb - 1, 'b');
+  std::string str_8mb_minus_2(eight_mb - 2,
+                              'b');  // BaseBinaryBuilder::memory_limit()

Review Comment:
   I'm not sure what this string has to do with 
`BaseBinaryBuilder::memory_limit()`?



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -407,6 +408,63 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) {
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 
+TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) {

Review Comment:
   Can you add a comment referring to the GH issue?



##########
cpp/src/arrow/compute/light_array_test.cc:
##########
@@ -407,6 +408,63 @@ TEST(ExecBatchBuilder, AppendValuesBeyondLimit) {
   ASSERT_EQ(0, pool->bytes_allocated());
 }
 
+TEST(ExecBatchBuilder, AppendVarLengthBeyondLimit) {
+  std::unique_ptr<MemoryPool> owned_pool = MemoryPool::CreateDefault();
+  MemoryPool* pool = owned_pool.get();
+  constexpr auto eight_mb = 8 * 1024 * 1024;
+  std::string str_8mb(eight_mb, 'a');
+  std::string str_8mb_minus_1(eight_mb - 1, 'b');
+  std::string str_8mb_minus_2(eight_mb - 2,
+                              'b');  // BaseBinaryBuilder::memory_limit()
+  std::shared_ptr<Array> values_8mb = ConstantArrayGenerator::String(1, 
str_8mb);
+  std::shared_ptr<Array> values_8mb_minus_1 =
+      ConstantArrayGenerator::String(1, str_8mb_minus_1);
+  std::shared_ptr<Array> values_8mb_minus_2 =
+      ConstantArrayGenerator::String(1, str_8mb_minus_2);
+
+  ExecBatch batch_8mb({values_8mb}, 1);
+  ExecBatch batch_8mb_minus_1({values_8mb_minus_1}, 1);
+  ExecBatch batch_8mb_minus_2({values_8mb_minus_2}, 1);
+
+  auto num_rows = std::numeric_limits<int32_t>::max() / eight_mb;
+  StringBuilder values_ref_builder;
+  ASSERT_OK(values_ref_builder.Reserve(num_rows + 1));
+  for (int i = 0; i < num_rows; i++) {
+    ASSERT_OK(values_ref_builder.Append(str_8mb));
+  }
+  ASSERT_OK(values_ref_builder.Append(str_8mb_minus_2));
+  ASSERT_OK_AND_ASSIGN(auto values_ref, values_ref_builder.Finish());
+  auto values_ref_1 = values_ref->Slice(0, num_rows);
+  ExecBatch batch_ref_1({values_ref_1}, num_rows);

Review Comment:
   This one isn't used, is it?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to