andishgar commented on code in PR #46229:
URL: https://github.com/apache/arrow/pull/46229#discussion_r2208006736


##########
cpp/src/arrow/array/array_binary.cc:
##########
@@ -105,6 +111,392 @@ 
BinaryViewArray::BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
       ArrayData::Make(std::move(type), length, std::move(buffers), null_count, 
offset));
 }
 
+namespace {
+
+// TODO Should We move this to bitmap_ops.h and Remove from 
compute/kernels/util.s
+Result<std::shared_ptr<Buffer>> GetOrCopyNullBitmapBuffer(const ArrayData& 
in_array,
+                                                          MemoryPool* pool) {
+  if (in_array.buffers[0]->data() == nullptr) {
+    return nullptr;
+  } else if (in_array.offset == 0) {
+    return in_array.buffers[0];
+  } else if (in_array.offset % 8 == 0) {
+    return SliceBuffer(in_array.buffers[0], /*offset=*/in_array.offset / 8);
+  } else {
+    // If a non-zero offset, we need to shift the bitmap
+    return internal::CopyBitmap(pool, in_array.buffers[0]->data(), 
in_array.offset,
+                                in_array.length);
+  }
+}
+
+struct Interval {
+  int64_t start;
+  int64_t end;
+  int32_t offset = -1;
+};
+
+struct IntervalComparator {
+  bool operator()(const Interval& left, const Interval& right) const {
+    return left.start < right.start;
+  }
+};
+
+// inspired from boost::icl::interval_set
+class IntervalMerger {
+ public:
+  using IntervalSet = std::set<Interval, IntervalComparator>;
+  using Iterator = std::set<Interval, IntervalComparator>::iterator;
+
+  void AddInterval(const Interval& interval) {
+    auto [it, is_inserted] = interval_set.insert(interval);
+    if (is_inserted) {
+      JointLeft(it);
+      JoinRight(it);
+    } else {
+      if (it->end < interval.end) {
+        const_cast<int64_t&>(it->end) = interval.end;
+        JoinRight(it);
+      }
+    }
+  }
+
+  int64_t CalculateOffsetAndTotalSize() {
+    int64_t total_size = 0;
+    for (auto& it : interval_set) {
+      const_cast<int32_t&>(it.offset) = static_cast<int32_t>(total_size);

Review Comment:
   Regarding overflow handling, there is no issue—I’ve written a test for the 
overflow case, and it passes.
   
   I believe it's possible to reason why `static_cast<int32_t>` is safe in this 
context.
   
   The intervals in view elements are determined by their `size` and `offset`, 
and neither of these values can exceed `INT32_MAX`. Therefore, the maximum 
range of data that can be referenced from view elements into a buffer is 
approximately 4 GB (minus 2 bytes).
   
   Since the `size` attribute is limited to `INT32_MAX`, it is not possible to 
have an `occupancy` greater than `INT32_MAX` unless there is an interval that 
includes or starts at the position `INT32_MAX`.
   
   
   As I mentioned above, `Interval.offset` refers to the position of the 
interval in an imaginary compacted buffer. Since intervals must either begin at 
or include the position `INT32_MAX` in such a situation, the value never 
exceeds `INT32_MAX`.
   
   What’s your opinion? Is my observation correct, or is there some exception?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to