pitrou commented on code in PR #13009:
URL: https://github.com/apache/arrow/pull/13009#discussion_r864576556


##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -59,11 +62,12 @@ class ArrayIterator {
 
   // Value access
   value_type operator*() const {
-    return array_->IsNull(index_) ? value_type{} : array_->GetView(index_);
+    return !array_ || array_->IsNull(index_) ? value_type{} : 
array_->GetView(index_);

Review Comment:
   Is there any circumstance where `array_` is null and you're still calling 
this operator? This doesn't seem right.



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), 
current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t 
index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }

Review Comment:
   This is the one place where `current_array_iterator_` could be useful for 
performance but you're not using it here?



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), 
current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t 
index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {
+      return current_array_iterator_[chunk_location.index_in_chunk -
+                                     current_array_iterator_.index()];
+    } else {
+      ArrayIterator<ArrayType> target_iterator{
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index)))};
+      return target_iterator[chunk_location.index_in_chunk];
+    }
+  }
+
+  int64_t index() const { return index_; }
+
+  // Forward / backward
+  ChunkedArrayIterator& operator++() {
+    (*this) += 1;
+    return *this;
+  }
+  ChunkedArrayIterator& operator--() {
+    (*this) -= 1;
+    return *this;
+  }
+
+  ChunkedArrayIterator operator++(int) {
+    ChunkedArrayIterator tmp(*this);
+    ++*this;
+    return tmp;
+  }
+  ChunkedArrayIterator operator--(int) {
+    ChunkedArrayIterator tmp(*this);
+    --*this;
+    return tmp;
+  }
+
+  // Arithmetic
+  difference_type operator-(const ChunkedArrayIterator& other) const {
+    return index_ - other.index_;
+  }
+  ChunkedArrayIterator operator+(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ + n);
+  }
+  ChunkedArrayIterator operator-(difference_type n) const {
+    return ChunkedArrayIterator(*chunked_array_, index_ - n);
+  }
+  friend inline ChunkedArrayIterator operator+(difference_type diff,
+                                               const ChunkedArrayIterator& 
other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff + other.index_);
+  }
+  friend inline ChunkedArrayIterator operator-(difference_type diff,
+                                               const ChunkedArrayIterator& 
other) {
+    return ChunkedArrayIterator(*other.chunked_array_, diff - other.index_);
+  }
+  ChunkedArrayIterator& operator+=(difference_type n) {
+    index_ += n;
+    auto chunk_location = GetChunkLocation(index_);
+    if (current_chunk_index_ == chunk_location.chunk_index) {

Review Comment:
   Again, I don't think this `if` branch is needed as constructing an 
`ArrayIterator` should be cheap.



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), 
current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t 
index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);
+    } else {
+      current_array_iterator_ = {};
+    }
+
+    this->current_chunk_index_ = chunk_location.chunk_index;
+    current_array_iterator_ -= this->index() - chunk_location.index_in_chunk;
+  }
+
+  // Value access
+  value_type operator*() const { return *current_array_iterator_; }
+
+  value_type operator[](difference_type n) const {
+    auto chunk_location = GetChunkLocation(index_ + n);
+    if (current_chunk_index_ == chunk_location.chunk_index) {

Review Comment:
   I don't think this `if` branch is needed, constructing a `ArrayIterator` 
should be cheap.



##########
cpp/src/arrow/stl_iterator_test.cc:
##########
@@ -248,5 +248,201 @@ TEST(ArrayIterator, StdMerge) {
   ASSERT_EQ(values, expected);
 }
 
+TEST(ChunkedArrayIterator, Basics) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])"});
+  auto it = Iterate<Int32Type>(*result);
+  optional<int32_t> v = *it;
+  ASSERT_EQ(v, 4);
+  ASSERT_EQ(it[0], 4);
+  ++it;
+  ASSERT_EQ(it[0], 5);
+  ASSERT_EQ(*it, 5);
+  ASSERT_EQ(it[1], nullopt);
+  ASSERT_EQ(it[2], 6);
+}
+
+TEST(ChunkedArrayIterator, Arithmetic) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 
7])"});
+
+  auto it = Iterate<Int32Type>(*result);
+  auto it2 = it + 2;
+  ASSERT_EQ(*it, 4);
+  ASSERT_EQ(*it2, nullopt);
+  ASSERT_EQ(it2 - it, 2);
+  ASSERT_EQ(it - it2, -2);
+  auto it3 = it++;
+  ASSERT_EQ(it2 - it, 1);
+  ASSERT_EQ(it2 - it3, 2);
+  ASSERT_EQ(*it3, 4);
+  ASSERT_EQ(*it, 5);
+  auto it4 = ++it;
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(*it4, nullopt);
+  ASSERT_EQ(it2 - it, 0);
+  ASSERT_EQ(it2 - it4, 0);
+  auto it5 = it + 3;
+  ASSERT_EQ(*it5, 7);
+  ASSERT_EQ(*(it5 - 2), 6);
+  ASSERT_EQ(*(it5 + (-2)), 6);
+  auto it6 = (--it5)--;
+  ASSERT_EQ(*it6, nullopt);
+  ASSERT_EQ(*it5, 6);
+  ASSERT_EQ(it6 - it5, 1);
+}
+
+TEST(ChunkedArrayIterator, Comparison) {
+  auto result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6, null, 
7])"});
+
+  auto it = Iterate<Int32Type>(*result) + 2;
+  auto it2 = Iterate<Int32Type>(*result) + 2;
+  auto it3 = Iterate<Int32Type>(*result) + 4;
+
+  ASSERT_TRUE(it == it2);
+  ASSERT_TRUE(it <= it2);
+  ASSERT_TRUE(it >= it2);
+  ASSERT_FALSE(it != it2);
+  ASSERT_FALSE(it < it2);
+  ASSERT_FALSE(it > it2);
+
+  ASSERT_FALSE(it == it3);
+  ASSERT_TRUE(it <= it3);
+  ASSERT_FALSE(it >= it3);
+  ASSERT_TRUE(it != it3);
+  ASSERT_TRUE(it < it3);
+  ASSERT_FALSE(it > it3);
+}
+
+TEST(ChunkedArrayIterator, MultipleChunks) {
+  auto result =
+      ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([6])", R"([7, 9, 
10, 8])",
+                                     R"([11, 13])", R"([14])", R"([15])", 
R"([16])"});
+
+  auto it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(it[8], 11);
+  ASSERT_EQ(it[9], 13);
+  it += 3;
+  ASSERT_EQ(it[0], 6);
+  ++it;
+  ASSERT_EQ(it[0], 7);
+  ASSERT_EQ(*it, 7);
+  ASSERT_EQ(it[1], 9);
+  ASSERT_EQ(it[2], 10);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 6);
+  ASSERT_EQ(it[4], 7);
+  it += 9;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  --it;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+  ASSERT_EQ(it[3], 14);
+  ASSERT_EQ(it[4], 15);
+  ASSERT_EQ(it[5], 16);
+  ++it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 14);
+  ASSERT_EQ(it[3], 15);
+  ASSERT_EQ(it[4], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  ASSERT_EQ(it[3], 16);
+  ++it;
+  ++it;
+  ASSERT_EQ(*it, 15);
+  ASSERT_EQ(it[0], 15);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 14);
+  ASSERT_EQ(it[0], 14);
+  ASSERT_EQ(it[1], 15);
+  ASSERT_EQ(it[2], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 14);
+  ASSERT_EQ(it[2], 15);
+  it -= 2;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+
+TEST(ChunkedArrayIterator, EmptyChunks) {
+  auto result = ChunkedArrayFromJSON(int32(), {});
+  auto it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(*it, nullopt);
+  ASSERT_EQ(it[0], nullopt);
+  it++;
+  ASSERT_EQ(it[0], nullopt);
+  it += 2;
+  ASSERT_EQ(it[1], nullopt);
+
+  result = ChunkedArrayFromJSON(int32(), {R"([4, 5, null])", R"([])", R"([7, 
9, 10, 8])",
+                                          R"([11, 13])", R"([])", R"([])", 
R"([16])"});
+
+  it = Iterate<Int32Type>(*result);
+  ASSERT_EQ(it[8], 13);
+  ASSERT_EQ(it[9], 16);
+  it += 3;
+  ASSERT_EQ(it[0], 7);
+  ++it;
+  ASSERT_EQ(it[0], 9);
+  ASSERT_EQ(*it, 9);
+  ASSERT_EQ(it[1], 10);
+  ASSERT_EQ(it[2], 8);
+  it -= 4;
+  ASSERT_EQ(it[0], 4);
+  ASSERT_EQ(it[1], 5);
+  ASSERT_EQ(it[2], nullopt);
+  ASSERT_EQ(it[3], 7);
+  ASSERT_EQ(it[4], 9);
+  it += 9;
+  ASSERT_EQ(*it, 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  ++it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  ++it;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  --it;
+  ASSERT_EQ(*it, 13);
+  ASSERT_EQ(it[0], 13);
+  ASSERT_EQ(it[1], 16);
+  --it;
+  ASSERT_EQ(*it, 11);
+  ASSERT_EQ(it[0], 11);
+  ASSERT_EQ(it[1], 13);
+  ASSERT_EQ(it[2], 16);
+  it += 2;
+  ASSERT_EQ(*it, 16);
+  ASSERT_EQ(it[0], 16);
+  it -= 3;
+  ASSERT_EQ(*it, 8);
+  ASSERT_EQ(it[0], 8);
+  ASSERT_EQ(it[1], 11);
+  ASSERT_EQ(it[2], 13);
+}
+

Review Comment:
   Two things:
   1) can you add tests with an empty chunked array?
   2) can you add tests with some standard algorithms as done above for 
`ArrayIterator`?



##########
cpp/src/arrow/stl_iterator.h:
##########
@@ -128,6 +132,153 @@ class ArrayIterator {
   int64_t index_;
 };
 
+template <typename ArrayType,
+          typename ValueAccessor = detail::DefaultValueAccessor<ArrayType>>
+class ChunkedArrayIterator {
+ public:
+  using value_type = arrow::util::optional<typename ValueAccessor::ValueType>;
+  using difference_type = int64_t;
+  using pointer = value_type*;
+  using reference = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  // Some algorithms need to default-construct an iterator
+  ChunkedArrayIterator() : chunked_array_(NULLPTR), index_(0), 
current_chunk_index_(0) {}
+
+  explicit ChunkedArrayIterator(const ChunkedArray& chunked_array, int64_t 
index = 0)
+      : chunked_array_(&chunked_array), index_(index) {
+    auto chunk_location = GetChunkLocation(this->index_);
+    if (chunked_array.length()) {
+      current_array_iterator_ = ArrayIterator<ArrayType>(
+          *arrow::internal::checked_pointer_cast<ArrayType>(
+              
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
+          index_);

Review Comment:
   This seems more logical:
   ```suggestion
         current_array_iterator_ = ArrayIterator<ArrayType>(
             *arrow::internal::checked_pointer_cast<ArrayType>(
                 
chunked_array_->chunk(static_cast<int>(chunk_location.chunk_index))),
             chunk_location.index_in_chunk);
   ```
   ... and then you don't need the subtraction below.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to