westonpace commented on code in PR #35628:
URL: https://github.com/apache/arrow/pull/35628#discussion_r1227602328


##########
cpp/src/arrow/array/array_binary.h:
##########
@@ -217,6 +218,90 @@ class ARROW_EXPORT LargeStringArray : public 
LargeBinaryArray {
   Status ValidateUTF8() const;
 };
 
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// StringHeader struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+
+  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
+
+  BinaryViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector 
char_buffers,

Review Comment:
   Minor nit but maybe rename `data` since there is already a member variable 
named `data_` which is something different?



##########
cpp/src/arrow/array/array_binary.h:
##########
@@ -217,6 +218,90 @@ class ARROW_EXPORT LargeStringArray : public 
LargeBinaryArray {
   Status ValidateUTF8() const;
 };
 
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// StringHeader struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+
+  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
+
+  BinaryViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector 
char_buffers,
+                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : PrimitiveArray(binary_view(), length, std::move(data), 
std::move(null_bitmap),
+                       null_count, offset) {
+    for (auto& char_buffer : char_buffers) {
+      data_->buffers.push_back(std::move(char_buffer));

Review Comment:
   Given there could be quite a few char buffers do we want to do a `reserve` 
here?



##########
cpp/src/arrow/array/array_binary_test.cc:
##########
@@ -365,38 +368,206 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { 
this->TestValidateOffsets();
 
 TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
 
+namespace string_header_helpers {
+
+StringHeader Inline(std::string_view chars) {
+  assert(StringHeader::IsInline(chars.size()));

Review Comment:
   I think we can use dcheck in tests.  Or even `EXPECT_TRUE`.  Probably not a 
big deal though.



##########
cpp/src/arrow/array/data.h:
##########
@@ -547,6 +551,21 @@ struct ARROW_EXPORT ArraySpan {
 
 namespace internal {
 
+template <typename F>

Review Comment:
   Perhaps `Func` instead of `F` for consistency?



##########
cpp/src/arrow/array/dict_internal.h:
##########
@@ -156,6 +156,60 @@ struct DictionaryTraits<T, enable_if_base_binary<T>> {
   }
 };
 
+template <typename T>
+struct DictionaryTraits<T, enable_if_binary_view_like<T>> {

Review Comment:
   Did this move here from somewhere?  Why are you using `Status` for 
`GetDictionaryArrayData` instead of `Result`?



##########
cpp/src/arrow/array/util.cc:
##########
@@ -267,6 +270,47 @@ class ArrayDataEndianSwapper {
     return Status::OK();
   }
 
+  Status Visit(const BinaryViewType& type) {
+    if (type.has_raw_pointers()) {
+      return Status::NotImplemented(
+          "Swapping endianness of binary / string view with raw pointers");

Review Comment:
   Would we ever want to do this?  It seems you wouldn't need to swap the 
endianness of raw pointers because they can't leave the machine anyways.



##########
cpp/src/arrow/compare.cc:
##########
@@ -261,6 +261,36 @@ class RangeDataEqualsImpl {
   // Also matches StringType
   Status Visit(const BinaryType& type) { return CompareBinary(type); }
 
+  // Also matches StringViewType
+  Status Visit(const BinaryViewType& type) {
+    auto* left_values = left_.GetValues<StringHeader>(1) + left_start_idx_;
+    auto* right_values = right_.GetValues<StringHeader>(1) + right_start_idx_;
+    if (type.has_raw_pointers()) {
+      VisitValidRuns([&](int64_t i, int64_t length) {
+        for (auto end_i = i + length; i < end_i; ++i) {
+          if (left_values[i] != right_values[i]) {
+            return false;
+          }
+        }
+        return true;
+      });
+      return Status::OK();
+    }
+
+    auto* left_buffers = left_.buffers.data() + 2;
+    auto* right_buffers = right_.buffers.data() + 2;
+    VisitValidRuns([&](int64_t i, int64_t length) {
+      for (auto end_i = i + length; i < end_i; ++i) {
+        if (!left_values[i].EqualsIndexOffset(left_buffers, right_values[i],
+                                              right_buffers)) {

Review Comment:
   This discrepancy between `==` and `EqualsIndexOffset` is slightly 
unsettling.  I really want there to be `EqualsRawPointers`.  But it's not a 
terrible thing.



##########
cpp/src/arrow/util/string_header.h:
##########
@@ -0,0 +1,377 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+/*
+ * Copyright (c) Facebook, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <array>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <iosfwd>
+#include <string>
+#include <string_view>
+
+namespace arrow {
+
+/// Variable length string or binary with 4 byte prefix and inline optimization
+/// for small values (12 bytes or fewer). This is similar to std::string_view
+/// except that the referenced is limited in size to UINT32_MAX and up to the
+/// first four bytes of the string are copied into the struct. The prefix 
allows
+/// failing comparisons early and can reduce the CPU cache working set when
+/// dealing with short strings.
+///
+/// This structure supports three states:
+///
+/// Short string   |----|----|--------|
+///                 ^    ^      ^
+///                 |    |      |
+///                 size prefix remaining in-line portion, zero padded
+///
+/// Long string    |----|----|--------|
+///                 ^    ^      ^
+///                 |    |      |
+///                 size prefix raw pointer to out-of-line portion
+///
+/// IO Long string |----|----|----|----|
+///                 ^    ^      ^     ^
+///                 |    |      |     `----------.
+///                 size prefix buffer index and offset to out-of-line portion
+///
+/// Adapted from TU Munich's UmbraDB [1], Velox, DuckDB.
+///
+/// [1]: https://db.in.tum.de/~freitag/papers/p29-neumann-cidr20.pdf
+///
+/// There is no way to determine from a non-inline StringHeader whether it 
refers
+/// to its out-of-line portion with a raw pointer or with index/offset. This
+/// information is stored at the column level; so a buffer of StringHeader will
+/// contain only one or the other. In general unless a StringHeader is resident
+/// in a StringView array's buffer it will refer to out-of-line data with a raw
+/// pointer. This default is assumed by several members of StringHeader such as
+/// operator==() and operator string_view() since these and other operations 
cannot
+/// be performed on index/offset StringHeaders without also accessing the 
buffers
+/// storing their out-of-line data. Which states pertain to each accessor and
+/// constructor are listed in their comments.
+struct alignas(8) StringHeader {
+ public:
+  using value_type = char;
+
+  static constexpr size_t kTotalSize = 16;
+  static constexpr size_t kSizeSize = sizeof(uint32_t);
+  static constexpr size_t kIndexOffsetSize = sizeof(uint32_t) * 2;
+  static constexpr size_t kPrefixSize = kTotalSize - kSizeSize - 
kIndexOffsetSize;
+  static_assert(kPrefixSize == 4);
+  static constexpr size_t kInlineSize = kTotalSize - kPrefixSize;
+  static_assert(kInlineSize == 12);
+
+  /// Construct an empty view.
+  StringHeader() = default;
+
+  /// Construct a RAW POINTER view.
+  StringHeader(const char* data, size_t len) : 
size_(static_cast<uint32_t>(len)) {
+    if (size_ == 0) return;
+
+    // TODO(bkietz) better option than assert?
+    assert(data);
+    if (IsInline()) {
+      // small string: inlined. Bytes beyond size_ are already 0
+      memcpy(prefix_.data(), data, size_);
+    } else {
+      // large string: store pointer
+      memcpy(prefix_.data(), data, kPrefixSize);
+      value_.data = data;
+    }
+  }
+
+  /// Construct a RAW POINTER view.
+  StringHeader(const uint8_t* data, int64_t len)
+      : StringHeader(reinterpret_cast<const char*>(data), 
static_cast<size_t>(len)) {}
+
+  /// Convenience implicit constructor for RAW POINTER views from C 
string/string literal.
+  ///
+  /// NOLINTNEXTLINE runtime/explicit
+  StringHeader(const char* data) : StringHeader(data, std::strlen(data)) {}
+
+  /// Construct a RAW POINTER view.
+  explicit StringHeader(const std::string& value)
+      : StringHeader(value.data(), value.size()) {}
+
+  /// Construct a RAW POINTER view.
+  explicit StringHeader(std::string_view value)
+      : StringHeader(value.data(), value.size()) {}
+
+  /// Construct an INDEX/OFFSET view.
+  StringHeader(const char* data, uint32_t len, uint32_t buffer_index,
+               const char* buffer_data)
+      : size_(len) {
+    if (size_ == 0) return;
+
+    // TODO(bkietz) better option than assert?
+    assert(data);
+    if (IsInline()) {
+      // small string: inlined. Bytes beyond size_ are already 0
+      memcpy(prefix_.data(), data, size_);
+    } else {
+      // large string: store index/offset
+      memcpy(prefix_.data(), data, kPrefixSize);
+      SetIndexOffset(buffer_index, static_cast<uint32_t>(data - buffer_data));
+    }
+  }
+
+  /// Construct an INDEX/OFFSET view.
+  StringHeader(uint32_t len, std::array<char, kPrefixSize> prefix, uint32_t 
buffer_index,
+               uint32_t offset)
+      : size_(len), prefix_(prefix) {
+    SetIndexOffset(buffer_index, offset);
+  }
+
+  template <typename I>
+  static constexpr bool IsInline(I size) {
+    return size <= static_cast<I>(kInlineSize);
+  }
+  static constexpr bool IsInline(uint32_t size) { return size <= kInlineSize; }
+
+  /// True if the view's data is entirely stored inline.
+  /// This function is safe for use against both RAW POINTER and INDEX/OFFSET 
views.
+  bool IsInline() const { return IsInline(size_); }
+
+  /// Return a RAW POINTER view's data.
+  const char* data() const& { return IsInline() ? prefix_.data() : 
value_.data; }
+  const char* data() && = delete;
+
+  /// The number of characters viewed by this StringHeader.
+  /// This function is safe for use against both RAW POINTER and INDEX/OFFSET 
views.
+  size_t size() const { return size_; }
+
+  /// Print a RAW POINTER view to a std::ostream.
+  friend std::ostream& operator<<(std::ostream& os, const StringHeader& 
header);
+
+  /// Equality comparison between RAW POINTER views.
+  bool operator==(const StringHeader& other) const {
+    // Compare lengths and first 4 characters.
+    if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) {
+      return false;
+    }
+    if (IsInline()) {
+      // The inline part is zeroed at construction, so we can compare
+      // a word at a time if data extends past 'prefix_'.
+      return size_ <= kPrefixSize || InlinedAsInt64() == 
other.InlinedAsInt64();
+    }
+    // Sizes are equal and this is not inline, therefore both are out
+    // of line and have kPrefixSize first in common.
+    return memcmp(value_.data + kPrefixSize, other.value_.data + kPrefixSize,
+                  size_ - kPrefixSize) == 0;
+  }
+
+  /// Inequality comparison between RAW POINTER views.
+  bool operator!=(const StringHeader& other) const { return !(*this == other); 
}
+
+  /// Less-than comparison between RAW POINTER views.
+  bool operator<(const StringHeader& other) const { return Compare(other) < 0; 
}
+
+  /// Less-than-or-equal comparison between RAW POINTER views.
+  bool operator<=(const StringHeader& other) const { return Compare(other) <= 
0; }
+
+  /// Greater-than comparison between RAW POINTER views.
+  bool operator>(const StringHeader& other) const { return Compare(other) > 0; 
}
+
+  /// Greater-than-or-equal comparison between RAW POINTER views.
+  bool operator>=(const StringHeader& other) const { return Compare(other) >= 
0; }
+
+  /// Conversion to std::string_view for RAW POINTER views.
+  explicit operator std::string_view() const& { return {data(), size()}; }
+  explicit operator std::string_view() && = delete;
+
+  /// Return the always-inline cached first 4 bytes of this StringHeader.
+  /// This function is safe for use against both RAW POINTER and INDEX/OFFSET 
views.
+  std::array<char, kPrefixSize> GetPrefix() const { return prefix_; }
+
+  /// Return an INDEX/OFFSET view's buffer index.
+  uint32_t GetBufferIndex() const { return value_.io_data.buffer_index; }
+
+  /// Return an INDEX/OFFSET view's buffer offset.
+  uint32_t GetBufferOffset() const { return value_.io_data.offset; }
+
+  /// Return a RAW POINTER view's data pointer.
+  ///
+  /// NOT VALID FOR INLINE VIEWS.
+  const char* GetRawPointer() const { return value_.data; }
+
+  /// Return an INDEX/OFFSET view's data pointer.
+  ///
+  /// NOT VALID FOR INLINE VIEWS.
+  template <typename BufferPtr>
+  const char* GetPointerFromBuffers(const BufferPtr* char_buffers) const {
+    return char_buffers[GetBufferIndex()]->template data_as<char>() + 
GetBufferOffset();
+  }
+
+  /// Return an INDEX/OFFSET view's data pointer.
+  template <typename BufferPtr>
+  const char* GetPointerFromBuffersOrInlineData(const BufferPtr* char_buffers) 
const {
+    return IsInline() ? GetInlineData() : GetPointerFromBuffers(char_buffers);
+  }
+
+  /// Return a the inline data of a view.
+  ///
+  /// For inline views, this points to the entire data of the view.
+  /// For other views, this points to the 4 byte prefix.
+  const char* GetInlineData() const& { return prefix_.data(); }
+  const char* GetInlineData() && = delete;
+
+  /// Mutate into a RAW POINTER view.
+  ///
+  /// This function is only intended for use in converting from an equivalent 
INDEX/OFFSET
+  /// view; in particular it does not check or modify the prefix for 
consistency with the
+  /// new data pointer.
+  void SetRawPointer(const char* data) { value_.data = data; }
+
+  /// Mutate into an INDEX/OFFSET view.
+  ///
+  /// This function is only intended for use in converting from an equivalent 
RAW POINTER
+  /// view; in particular it does not check or modify the prefix for 
consistency with the
+  /// new buffer index/offset.
+  void SetIndexOffset(uint32_t buffer_index, uint32_t offset) {
+    value_.io_data = {buffer_index, offset};
+  }
+
+  /// Equality compare an INDEX/OFFSET view in place.
+  ///
+  /// Equivalent comparison will be accomplished by (for example) first 
converting both
+  /// views to std::string_view and comparing those, but this would not take 
advantage
+  /// of the cached 4 byte prefix.
+  template <typename BufferPtr>
+  bool EqualsIndexOffset(const BufferPtr* char_buffers, const StringHeader& 
other,
+                         const BufferPtr* other_char_buffers) const {
+    if (SizeAndPrefixAsInt64() != other.SizeAndPrefixAsInt64()) {
+      return false;
+    }
+    if (IsInline()) {
+      return InlinedAsInt64() == other.InlinedAsInt64();
+    }
+    // Sizes are equal and this is not inline, therefore both are out of line 
and we
+    // have already checked that their kPrefixSize first characters are equal.
+    return memcmp(GetPointerFromBuffers(char_buffers) + kPrefixSize,
+                  other.GetPointerFromBuffers(other_char_buffers) + 
kPrefixSize,
+                  size() - kPrefixSize) == 0;
+  }
+
+  /// Less-than compare an INDEX/OFFSET view in place.
+  ///
+  /// Equivalent comparison will be accomplished by (for example) first 
converting both
+  /// views to std::string_view and comparing those, but this would not take 
advantage
+  /// of the cached 4 byte prefix.
+  template <typename BufferPtr>
+  bool LessThanIndexOffset(const BufferPtr* char_buffers, const StringHeader& 
other,
+                           const BufferPtr* other_char_buffers) const {
+    return CompareIndexOffset(char_buffers, other, other_char_buffers) < 0;
+  }
+
+ private:
+  // Returns 0, if this == other
+  //       < 0, if this < other
+  //       > 0, if this > other
+  int32_t Compare(const StringHeader& other) const {
+    if (PrefixAsInt() != other.PrefixAsInt()) {
+      // The result is decided on prefix. The shorter will be less
+      // because the prefix is padded with zeros.
+      return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize);
+    }
+    int32_t size = std::min(size_, other.size_) - kPrefixSize;
+    if (size <= 0) {
+      // One string is just the prefix.
+      return size_ - other.size_;
+    }
+    if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && 
other.IsInline()) {
+      int32_t result = memcmp(value_.inlined.data(), 
other.value_.inlined.data(), size);
+      return (result != 0) ? result : size_ - other.size_;
+    }
+    int32_t result = memcmp(data() + kPrefixSize, other.data() + kPrefixSize, 
size);
+    return (result != 0) ? result : size_ - other.size_;
+  }
+
+  template <typename BufferPtr>
+  int CompareIndexOffset(const BufferPtr* char_buffers, const StringHeader& 
other,
+                         const BufferPtr* other_char_buffers) const {
+    if (PrefixAsInt() != other.PrefixAsInt()) {
+      // The result is decided on prefix. The shorter will be less
+      // because the prefix is padded with zeros.
+      return memcmp(prefix_.data(), other.prefix_.data(), kPrefixSize);
+    }
+    int32_t size = std::min(size_, other.size_) - kPrefixSize;
+    if (size <= 0) {
+      // One string is just the prefix.
+      return size_ - other.size_;
+    }
+    if (static_cast<uint32_t>(size) <= kInlineSize && IsInline() && 
other.IsInline()) {
+      int32_t result = memcmp(value_.inlined.data(), 
other.value_.inlined.data(), size);
+      return (result != 0) ? result : size_ - other.size_;
+    }
+
+    int32_t result = memcmp(
+        GetPointerFromBuffersOrInlineData(char_buffers) + kPrefixSize,
+        other.GetPointerFromBuffersOrInlineData(other_char_buffers) + 
kPrefixSize, size);
+    return (result != 0) ? result : size_ - other.size_;
+  }
+
+  int64_t SizeAndPrefixAsInt64() const {
+    return reinterpret_cast<const int64_t*>(this)[0];
+  }
+
+  int64_t InlinedAsInt64() const { return reinterpret_cast<const 
int64_t*>(this)[1]; }
+
+  int32_t PrefixAsInt() const { return *reinterpret_cast<const 
int32_t*>(&prefix_); }
+
+  // FIXME(bkietz) replace this with a std::array<uint8_t, 16> and forgo the 
union.
+  // Type punning (AKA violation of the strict aliasing rule) is undefined 
behavior.
+  // Using memcpy to access the bytes of the object representation of 
trivially copyable
+  // objects is not undefined behavior. Given sufficiently explicit hints on 
alignment
+  // and size, compilers elide memcpy calls in favor of identical assembly to 
what
+  // the type punning implementation produces.
+  // We rely on all members being laid out top to bottom . C++
+  // guarantees this.
+  uint32_t size_ = 0;
+  std::array<char, kPrefixSize> prefix_ = {0};
+  union {
+    std::array<char, 8> inlined = {0};
+    const char* data;
+    struct {
+      uint32_t buffer_index;
+      uint32_t offset;

Review Comment:
   Why unsigned integers?  Is there some place you are relying on the rollover 
behavior?  This just seems slightly out of sync with the way strings (as in 
normal strings, not views) are done (e.g. using `int32_t`).



##########
cpp/src/arrow/array/array_binary_test.cc:
##########
@@ -365,38 +368,206 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { 
this->TestValidateOffsets();
 
 TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
 
+namespace string_header_helpers {
+
+StringHeader Inline(std::string_view chars) {
+  assert(StringHeader::IsInline(chars.size()));
+  return StringHeader{chars};
+}
+
+StringHeader NotInline(std::string_view prefix, size_t length, size_t 
buffer_index,
+                       size_t offset) {
+  assert(prefix.size() == 4);
+  assert(!StringHeader::IsInline(length));
+  StringHeader s{prefix.data(), length};
+  s.SetIndexOffset(static_cast<uint32_t>(buffer_index), 
static_cast<uint32_t>(offset));
+  return s;
+}
+
+Result<std::shared_ptr<StringViewArray>> MakeArray(
+    BufferVector char_buffers, const std::vector<StringHeader>& headers,
+    bool validate = true) {
+  auto length = static_cast<int64_t>(headers.size());
+  ARROW_ASSIGN_OR_RAISE(auto headers_buf, CopyBufferFromVector(headers));
+  auto arr = std::make_shared<StringViewArray>(length, std::move(headers_buf),
+                                               std::move(char_buffers));
+  if (validate) {
+    RETURN_NOT_OK(arr->ValidateFull());
+  }
+  return arr;
+}
+
+Result<std::shared_ptr<StringViewArray>> MakeArrayFromRawPointerViews(
+    BufferVector char_buffers, const std::vector<StringHeader>& raw) {
+  ARROW_ASSIGN_OR_RAISE(auto raw_buf, CopyBufferFromVector(raw));
+  StringViewArray raw_arr{static_cast<int64_t>(raw.size()), std::move(raw_buf),
+                          char_buffers};
+  raw_arr.data()->type = utf8_view(/*has_raw_pointers=*/true);
+
+  ARROW_ASSIGN_OR_RAISE(auto io_buf, AllocateBuffer(raw.size() * 
sizeof(StringHeader)));
+  RETURN_NOT_OK(internal::SwapStringHeaderPointers(
+      *raw_arr.data(), io_buf->mutable_data_as<StringHeader>()));
+
+  auto arr = std::make_shared<StringViewArray>(raw.size(), std::move(io_buf),
+                                               std::move(char_buffers));
+  RETURN_NOT_OK(arr->ValidateFull());
+  return arr;
+}
+
+}  // namespace string_header_helpers
+
+TEST(StringViewArray, Validate) {
+  using string_header_helpers::Inline;
+  using string_header_helpers::MakeArray;
+  using string_header_helpers::NotInline;
+
+  // Since this is a test of validation, we need to be able to construct 
invalid arrays.
+  auto buffer_s = Buffer::FromString("supercalifragilistic(sp?)");
+  auto buffer_y = Buffer::FromString("yyyyyyyyyyyyyyyyyyyyyyyyy");
+
+  // empty array is valid
+  EXPECT_THAT(MakeArray({}, {}), Ok());
+
+  // empty array with some character buffers is valid
+  EXPECT_THAT(MakeArray({buffer_s, buffer_y}, {}), Ok());

Review Comment:
   Is `EXPECT_THAT(..., Ok())` more readable than `EXPECT_OK`?  Although I 
suppose the latter introduces more macros.



##########
cpp/src/arrow/array/array_binary_test.cc:
##########
@@ -365,38 +368,206 @@ TYPED_TEST(TestStringArray, TestValidateOffsets) { 
this->TestValidateOffsets();
 
 TYPED_TEST(TestStringArray, TestValidateData) { this->TestValidateData(); }
 
+namespace string_header_helpers {
+
+StringHeader Inline(std::string_view chars) {
+  assert(StringHeader::IsInline(chars.size()));
+  return StringHeader{chars};
+}
+
+StringHeader NotInline(std::string_view prefix, size_t length, size_t 
buffer_index,
+                       size_t offset) {
+  assert(prefix.size() == 4);
+  assert(!StringHeader::IsInline(length));
+  StringHeader s{prefix.data(), length};
+  s.SetIndexOffset(static_cast<uint32_t>(buffer_index), 
static_cast<uint32_t>(offset));
+  return s;
+}
+
+Result<std::shared_ptr<StringViewArray>> MakeArray(
+    BufferVector char_buffers, const std::vector<StringHeader>& headers,
+    bool validate = true) {
+  auto length = static_cast<int64_t>(headers.size());
+  ARROW_ASSIGN_OR_RAISE(auto headers_buf, CopyBufferFromVector(headers));
+  auto arr = std::make_shared<StringViewArray>(length, std::move(headers_buf),
+                                               std::move(char_buffers));
+  if (validate) {
+    RETURN_NOT_OK(arr->ValidateFull());
+  }
+  return arr;
+}
+
+Result<std::shared_ptr<StringViewArray>> MakeArrayFromRawPointerViews(

Review Comment:
   Does the returned array have `has_raw_pointers` set to true or false?



##########
cpp/src/arrow/array/array_binary.h:
##########
@@ -217,6 +218,90 @@ class ARROW_EXPORT LargeStringArray : public 
LargeBinaryArray {
   Status ValidateUTF8() const;
 };
 
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// StringHeader struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+
+  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
+
+  BinaryViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector 
char_buffers,
+                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : PrimitiveArray(binary_view(), length, std::move(data), 
std::move(null_bitmap),
+                       null_count, offset) {
+    for (auto& char_buffer : char_buffers) {
+      data_->buffers.push_back(std::move(char_buffer));
+    }
+  }
+
+  const StringHeader* raw_values() const {
+    return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset;
+  }
+
+  // For API compatibility with BinaryArray etc.
+  std::string_view GetView(int64_t i) const {
+    const auto& s = raw_values()[i];
+    if (raw_pointers_) {
+      return std::string_view{s};
+    }
+    if (s.IsInline()) {
+      return {s.GetInlineData(), s.size()};
+    }
+    auto* char_buffers = data_->buffers.data() + 2;
+    return {char_buffers[s.GetBufferIndex()]->data_as<char>() + 
s.GetBufferOffset(),
+            s.size()};
+  }
+
+  // EXPERIMENTAL

Review Comment:
   What is experimental here?



##########
cpp/src/arrow/array/concatenate.cc:
##########
@@ -229,6 +229,43 @@ class ConcatenateImpl {
     return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]);
   }
 
+  Status Visit(const BinaryViewType& type) {
+    out_->buffers.resize(2);
+
+    for (const auto& in_data : in_) {
+      auto begin = in_data->buffers.begin() + 2;
+      auto end = in_data->buffers.end();
+
+      for (auto it = begin; it != end; ++it) {
+        out_->buffers.push_back(*it);
+      }
+    }
+
+    ARROW_ASSIGN_OR_RAISE(auto header_buffers, Buffers(1, 
sizeof(StringHeader)));
+    ARROW_ASSIGN_OR_RAISE(auto header_buffer, 
ConcatenateBuffers(header_buffers, pool_));
+
+    if (!type.has_raw_pointers()) {
+      auto* s = header_buffer->mutable_data_as<StringHeader>();
+
+      size_t preceding_buffer_count = 0;
+
+      int64_t i = in_[0]->length;
+      for (size_t in_index = 1; in_index < in_.size(); ++in_index) {
+        preceding_buffer_count += in_[in_index - 1]->buffers.size() - 2;
+
+        for (int64_t end_i = i + in_[in_index]->length; i < end_i; ++i) {
+          if (s[i].IsInline()) continue;
+          auto buffer_index =
+              static_cast<uint32_t>(s[i].GetBufferIndex() + 
preceding_buffer_count);
+          s[i].SetIndexOffset(buffer_index, s[i].GetBufferOffset());
+        }
+      }
+    }
+
+    out_->buffers[1] = std::move(header_buffer);

Review Comment:
   What about `out_->buffers[0]`?  Or maybe I'm just missing it.



##########
cpp/src/arrow/array/dict_internal.h:
##########
@@ -156,6 +156,60 @@ struct DictionaryTraits<T, enable_if_base_binary<T>> {
   }
 };
 
+template <typename T>
+struct DictionaryTraits<T, enable_if_binary_view_like<T>> {
+  using MemoTableType = typename HashTraits<T>::MemoTableType;
+
+  static_assert(std::is_same_v<MemoTableType, BinaryMemoTable<BinaryBuilder>>);
+
+  static Status GetDictionaryArrayData(MemoryPool* pool,

Review Comment:
   Could you add some kind of header explaining what is going on here?



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -595,6 +614,61 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    int64_t headers_byte_size = data.buffers[1]->size();
+    int64_t required_headers = data.length + data.offset;
+    if (static_cast<int64_t>(headers_byte_size / sizeof(StringHeader)) <
+        required_headers) {
+      return Status::Invalid("Header buffer size (bytes): ", headers_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+
+    if (!full_validation) {
+      return Status::OK();
+    }
+
+    if (type.has_raw_pointers()) {
+      // TODO(bkietz) validate as with conversions?

Review Comment:
   This is the kind of TODO that no one is going to understand 3 months from 
now I think



##########
cpp/src/arrow/array/array_binary.h:
##########
@@ -217,6 +218,90 @@ class ARROW_EXPORT LargeStringArray : public 
LargeBinaryArray {
   Status ValidateUTF8() const;
 };
 
+// ----------------------------------------------------------------------
+// BinaryView and StringView
+
+/// Concrete Array class for variable-size binary view data using the
+/// StringHeader struct to reference in-line or out-of-line string values
+class ARROW_EXPORT BinaryViewArray : public PrimitiveArray {
+ public:
+  using TypeClass = BinaryViewType;
+  using IteratorType = stl::ArrayIterator<BinaryViewArray>;
+
+  explicit BinaryViewArray(const std::shared_ptr<ArrayData>& data);
+
+  BinaryViewArray(int64_t length, std::shared_ptr<Buffer> data, BufferVector 
char_buffers,
+                  std::shared_ptr<Buffer> null_bitmap = NULLPTR,
+                  int64_t null_count = kUnknownNullCount, int64_t offset = 0)
+      : PrimitiveArray(binary_view(), length, std::move(data), 
std::move(null_bitmap),
+                       null_count, offset) {
+    for (auto& char_buffer : char_buffers) {
+      data_->buffers.push_back(std::move(char_buffer));
+    }
+  }
+
+  const StringHeader* raw_values() const {
+    return reinterpret_cast<const StringHeader*>(raw_values_) + data_->offset;
+  }
+
+  // For API compatibility with BinaryArray etc.
+  std::string_view GetView(int64_t i) const {
+    const auto& s = raw_values()[i];
+    if (raw_pointers_) {
+      return std::string_view{s};

Review Comment:
   I've started at this for longer than I care to admit.  `s` is `const 
StringHeader&`.  I'm confused how this works?  Is it forwarding the members of 
`s`?  In other words, is this equivalent to `std::string_view{s.size_, 
s.prefix_}`?



##########
cpp/src/arrow/array/validate.cc:
##########
@@ -595,6 +614,61 @@ struct ValidateArrayImpl {
     return Status::OK();
   }
 
+  Status ValidateBinaryView(const BinaryViewType& type) {
+    int64_t headers_byte_size = data.buffers[1]->size();
+    int64_t required_headers = data.length + data.offset;
+    if (static_cast<int64_t>(headers_byte_size / sizeof(StringHeader)) <
+        required_headers) {
+      return Status::Invalid("Header buffer size (bytes): ", headers_byte_size,
+                             " isn't large enough for length: ", data.length,
+                             " and offset: ", data.offset);
+    }
+

Review Comment:
   You could also perhaps validate that `headers_byte_size % 
sizeof(StringHeader) == 0`



##########
cpp/src/arrow/array/data.cc:
##########
@@ -214,6 +214,14 @@ void ArraySpan::SetMembers(const ArrayData& data) {
     this->buffers[i] = {};
   }
 
+  if (type_id == Type::STRING_VIEW || type_id == Type::BINARY_VIEW) {
+    // store the span of character buffers in the third buffer
+    this->buffers[2].data =
+        const_cast<uint8_t*>(reinterpret_cast<const 
uint8_t*>(data.buffers.data() + 2));
+    this->buffers[2].size =
+        static_cast<int64_t>(data.buffers.size() - 2) * 
sizeof(std::shared_ptr<Buffer>);

Review Comment:
   Clever, but feels a touch sinister.  Do you ever use this buffer for 
anything other than re-creating the array on round trip?



##########
cpp/src/arrow/array/data.h:
##########
@@ -547,6 +551,21 @@ struct ARROW_EXPORT ArraySpan {
 
 namespace internal {
 
+template <typename F>
+Status VisitSlices(ArraySpan input, int64_t slice_size, const F& f) {
+  int64_t num_slices = input.length / slice_size;
+  int64_t trailing_slice_size = input.length % slice_size;
+  int64_t offset = input.offset;
+
+  for (int64_t i = 0; i < num_slices; ++i) {
+    input.SetSlice(offset, slice_size);
+    ARROW_RETURN_NOT_OK(f(input));
+    offset += slice_size;
+  }
+  input.SetSlice(offset, trailing_slice_size);

Review Comment:
   This could be a slice of size zero?  Is that a problem?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to