[GitHub] [arrow] michalursa commented on a diff in pull request #12872: ARROW-16166: [C++][Compute] Utilities for assembling join output

GitBox Fri, 15 Apr 2022 00:08:48 -0700


michalursa commented on code in PR #12872:
URL: https://github.com/apache/arrow/pull/12872#discussion_r851102132



##########
cpp/src/arrow/compute/light_array.h:
##########
@@ -0,0 +1,384 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <cstdint>
+
+#include "arrow/array.h"
+#include "arrow/compute/exec.h"
+#include "arrow/type.h"
+#include "arrow/util/logging.h"
+
+/// This file contains lightweight containers for Arrow buffers.  These
+/// containers make compromises in terms of strong ownership and the range of
+/// data types supported in order to gain performance and reduce overhead.
+
+namespace arrow {
+namespace compute {
+
+/// \brief Layout descriptor for a "key" column
+///
+/// Key columns are restricted to non-nested, non-union types.  Such a column
+/// carries either 0 buffers (null), 2 buffers (e.g. int32) or 3 buffers
+/// (e.g. string) and never has children.
+///
+/// This struct plays the role of arrow::DataType without needing any allocation
+struct KeyColumnMetadata {
+  KeyColumnMetadata() = default;
+  KeyColumnMetadata(bool is_fixed_length_in, uint32_t fixed_length_in,
+                    bool is_null_type_in = false)
+      : is_fixed_length{is_fixed_length_in},
+        is_null_type{is_null_type_in},
+        fixed_length{fixed_length_in} {}
+
+  /// \brief False only for varying-length binary columns
+  ///
+  /// When true, the column uses a validity buffer and a data buffer; the
+  /// third buffer is unused.
+  bool is_fixed_length;
+  /// \brief True when the column is of the null type
+  bool is_null_type;
+  /// \brief Width of each item, in bytes
+  ///
+  /// Zero is special: unless the column is of the null type, it denotes a bit
+  /// vector holding one bit per value.
+  ///
+  /// For a varying-length binary column this is the number of bytes per offset.
+  uint32_t fixed_length;
+};
+
+/// \brief A lightweight, non-owning view into a "key" array
+///
+/// A "key" column is a non-nested, non-union column \see KeyColumnMetadata
+///
+/// This object is a zero-allocation analogue of arrow::ArrayData
+class KeyColumnArray {
+ public:
+  /// \brief Create an uninitialized KeyColumnArray
+  KeyColumnArray() = default;
+  /// \brief Create a read-only view from buffers
+  ///
+  /// This is a view only and does not take ownership of the buffers.  The
+  /// lifetime of the buffers must exceed the lifetime of this view
+  KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+                 const uint8_t* buffer0, const uint8_t* buffer1,
+                 const uint8_t* buffer2, int bit_offset0 = 0,
+                 int bit_offset1 = 0);
+  /// \brief Create a mutable view from buffers
+  ///
+  /// This is a view only and does not take ownership of the buffers.  The
+  /// lifetime of the buffers must exceed the lifetime of this view
+  KeyColumnArray(const KeyColumnMetadata& metadata, int64_t length,
+                 uint8_t* buffer0, uint8_t* buffer1, uint8_t* buffer2,
+                 int bit_offset0 = 0, int bit_offset1 = 0);
+  /// \brief Create a sliced view of `this`
+  ///
+  /// The number of rows used in offset must be divisible by 8
+  /// in order to not split bit vectors within a single byte.
+  KeyColumnArray Slice(int64_t offset, int64_t length) const;
+  /// \brief Create a copy of `this` with a buffer from `other`
+  ///
+  /// The copy will be identical to `this` except the buffer at
+  /// buffer_id_to_replace will be replaced by the corresponding buffer in
+  /// `other`.
+  KeyColumnArray WithBufferFrom(const KeyColumnArray& other,
+                                int buffer_id_to_replace) const;
+
+  /// \brief Create a copy of `this` with new metadata
+  KeyColumnArray WithMetadata(const KeyColumnMetadata& metadata) const;
+  /// \brief Return one of the underlying mutable buffers
+  ///
+  /// \param i buffer index, must be in [0, max_buffers_)
+  uint8_t* mutable_data(int i) {
+    // Strict upper bound: mutable_buffers_ holds exactly max_buffers_ entries.
+    ARROW_DCHECK(i >= 0 && i < max_buffers_);
+    return mutable_buffers_[i];
+  }
+  /// \brief Return one of the underlying read-only buffers
+  ///
+  /// \param i buffer index, must be in [0, max_buffers_)
+  const uint8_t* data(int i) const {
+    // Strict upper bound: buffers_ holds exactly max_buffers_ entries.
+    ARROW_DCHECK(i >= 0 && i < max_buffers_);
+    return buffers_[i];
+  }
+  /// \brief Return a mutable version of the offsets buffer
+  ///
+  /// Only valid if this is a view into a varbinary type
+  uint32_t* mutable_offsets() {
+    ARROW_DCHECK(!metadata_.is_fixed_length);
+    return reinterpret_cast<uint32_t*>(mutable_data(1));
+  }
+  /// \brief Return a read-only version of the offsets buffer
+  ///
+  /// Only valid if this is a view into a varbinary type
+  const uint32_t* offsets() const {
+    ARROW_DCHECK(!metadata_.is_fixed_length);
+    return reinterpret_cast<const uint32_t*>(data(1));
+  }
+  /// \brief Return the type metadata
+  const KeyColumnMetadata& metadata() const { return metadata_; }
+  /// \brief Return the length (in rows) of the array
+  int64_t length() const { return length_; }
+  /// \brief Return the bit offset into the corresponding vector
+  ///
+  /// if i == 1 then this must be a bool array
+  ///
+  /// \param i buffer index, must be 0 or 1
+  int bit_offset(int i) const {
+    // bit_offset_ has one entry fewer than the buffer arrays, so the valid
+    // index range is [0, max_buffers_ - 1).
+    ARROW_DCHECK(i >= 0 && i < max_buffers_ - 1);
+    return bit_offset_[i];
+  }
+
+ private:
+  static constexpr int max_buffers_ = 3;
+  const uint8_t* buffers_[max_buffers_];
+  uint8_t* mutable_buffers_[max_buffers_];
+  KeyColumnMetadata metadata_;
+  int64_t length_;
+  // Starting bit offset within the first byte (between 0 and 7)
+  // to be used when accessing buffers that store bit vectors.
+  int bit_offset_[max_buffers_ - 1];
+};
+
+/// \brief Create KeyColumnMetadata from a DataType
+///
+/// If `type` is a dictionary type then this will return the KeyColumnMetadata 
for
+/// the indices type
+///
+/// The caller should ensure this is only called on "key" columns.  Calling 
this with
+/// a non-key column will return a meaningless value (or abort on a debug 
build)
+KeyColumnMetadata ColumnMetadataFromDataType(const std::shared_ptr<DataType>& 
type);
+
+/// \brief Create KeyColumnArray from ArrayData
+///
+/// If `array_data` has a dictionary type then this will return the
+/// KeyColumnArray for the indices array
+///
+/// The caller should ensure this is only called on "key" columns.
+/// \see ColumnMetadataFromDataType for details
+KeyColumnArray ColumnArrayFromArrayData(const std::shared_ptr<ArrayData>& 
array_data,
+                                        int start_row, int num_rows);
+
+/// \brief Create KeyColumnMetadata instances from an ExecBatch
+///
+/// column_metadatas will be resized to fit
+///
+/// All columns in `batch` must be eligible "key" columns and have an array 
shape
+/// \see ColumnMetadataFromDataType for more details
+void ColumnMetadatasFromExecBatch(const ExecBatch& batch,
+                                  std::vector<KeyColumnMetadata>* 
column_metadatas);
+
+/// \brief Create KeyColumnArray instances from a slice of an ExecBatch
+///
+/// column_arrays will be resized to fit
+///
+/// All columns in `batch` must be eligible "key" columns and have an array 
shape
+/// \see ColumnArrayFromArrayData for more details
+void ColumnArraysFromExecBatch(const ExecBatch& batch, int start_row, int 
num_rows,
+                               std::vector<KeyColumnArray>* column_arrays);
+
+/// \brief Create KeyColumnArray instances from an ExecBatch
+///
+/// column_arrays will be resized to fit
+///
+/// All columns in `batch` must be eligible "key" columns and have an array 
shape
+/// \see ColumnArrayFromArrayData for more details
+void ColumnArraysFromExecBatch(const ExecBatch& batch,
+                               std::vector<KeyColumnArray>* column_arrays);
+
+/// \brief A lightweight, growable array for "key" columns
+///
+/// In contrast to KeyColumnArray, an instance of this class owns its buffers
+///
+/// Storage is backed by arrow::ResizableBuffer and grown with a doubling
+/// strategy, so every resize rounds the capacity up to the next power of 2
+class ResizableArrayData {
+ public:
+  /// \brief Create an uninitialized instance
+  ///
+  /// Init must be called before calling any other operations
+  ResizableArrayData() = default;
+  ~ResizableArrayData() { Clear(true); }
+  /// \brief Initialize the array
+  /// \param data_type The data type this array is holding data for.
+  /// \param pool The pool to make allocations on
+  /// \param log_num_rows_min Every resize allocates room for at least
+  ///                         (1 << log_num_rows_min) rows
+  void Init(const std::shared_ptr<DataType>& data_type, MemoryPool* pool,
+            int log_num_rows_min);
+  /// \brief Reset the array to an empty state
+  /// \param release_buffers If true, allocated memory is released and the
+  ///                        next resize operation will have to reallocate
+  void Clear(bool release_buffers);
+  /// \brief Resize the fixed length buffers
+  ///
+  /// Afterwards the buffers can hold at least `num_rows_new` rows of data
+  Status ResizeFixedLengthBuffers(int num_rows_new);
+  /// \brief Resize the varying length buffer of a variable binary array
+  ///
+  /// Call this only after the offsets have been populated; the buffer is
+  /// resized to hold at least as much data as the offsets require
+  ///
+  /// Does nothing if the array is not a variable binary type
+  Status ResizeVaryingLengthBuffer();
+  /// \brief The current length (in rows) of the array
+  int num_rows() const { return num_rows_; }
+  /// \brief A non-owning view into this array
+  KeyColumnArray column_array() const;
+  /// \brief A lightweight descriptor of the data held by this array
+  KeyColumnMetadata column_metadata() const {
+    return ColumnMetadataFromDataType(data_type_);
+  }
+  /// \brief Convert the data to an arrow::ArrayData
+  ///
+  /// This is a zero copy operation and the created ArrayData will reference
+  /// the buffers held by this instance.
+  std::shared_ptr<ArrayData> array_data() const;
+  /// \brief A raw pointer to the requested buffer
+  ///
+  /// i == 0 selects the validity buffer
+  /// i == 1 selects the values buffer (fixed length data types) or the
+  ///        offsets buffer (variable binary types)
+  /// i == 2 selects the buffer used for variable length binary data
+  uint8_t* mutable_data(int i) {
+    switch (i) {
+      case 0:
+        return non_null_buf_->mutable_data();
+      case 1:
+        return fixed_len_buf_->mutable_data();
+      default:
+        return var_len_buf_->mutable_data();
+    }
+  }
+
+ private:
+  static constexpr int64_t kNumPaddingBytes = 64;
+  int log_num_rows_min_ = 0;
+  std::shared_ptr<DataType> data_type_;
+  MemoryPool* pool_ = NULLPTR;
+  int num_rows_ = 0;
+  int num_rows_allocated_ = 0;
+  int var_len_buf_size_ = 0;
+  std::shared_ptr<ResizableBuffer> non_null_buf_;
+  std::shared_ptr<ResizableBuffer> fixed_len_buf_;
+  std::shared_ptr<ResizableBuffer> var_len_buf_;
+};
+
+/// \brief A builder to concatenate batches of data into a larger batch
+///
+/// Will only store num_rows_max() rows
+class ExecBatchBuilder {
+ public:
+  /// \brief Add rows from `source` into `target` column
+  ///
+  /// If `target` is uninitialized or cleared it will be initialized to use
+  /// the given pool.
+  static Status AppendSelected(const std::shared_ptr<ArrayData>& source,
+                               ResizableArrayData* target, int 
num_rows_to_append,
+                               const uint16_t* row_ids, MemoryPool* pool);
+
+  /// \brief Add nulls into `target` column
+  ///
+  /// If `target` is uninitialized or cleared it will be initialized to use
+  /// the given pool.
+  static Status AppendNulls(const std::shared_ptr<DataType>& type,
+                            ResizableArrayData& target, int num_rows_to_append,
+                            MemoryPool* pool);
+
+  /// \brief Add selected rows from `batch`
+  ///
+  /// If `col_ids` is null then `num_cols` should be less than
+  /// batch.num_values() and the first `num_cols` columns of batch will be
+  /// appended.
+  ///
+  /// All columns in `batch` must have array shape
+  Status AppendSelected(MemoryPool* pool, const ExecBatch& batch, int 
num_rows_to_append,
+                        const uint16_t* row_ids, int num_cols,
+                        const int* col_ids = NULLPTR);

Review Comment:
   The AppendSelected and AppendNulls variants that returned num_appended were
   wrappers on top of the versions that did not. They can be removed as long as
   the caller is willing to make sure it doesn't exceed the batch size limit
   (there isn't anything incorrect about exceeding the limit at this point, but
   consumers of exec batches often assume that the rows can be indexed using
   16-bit integers).
   
   I removed the variants that returned num_appended (they were not used in the
   new hash join) and added an error status return for the case where the limit
   is exceeded.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: github-unsubscr...@arrow.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

[GitHub] [arrow] michalursa commented on a diff in pull request #12872: ARROW-16166: [C++][Compute] Utilities for assembling join output

Reply via email to