This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/master by this push:
     new 390f7d0  ARROW-3542: [C++] Use unsafe appends when building array from 
CSV
390f7d0 is described below

commit 390f7d0e235cc09914ebde465c8ca2fd03d2ef67
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Oct 17 13:29:39 2018 -0400

    ARROW-3542: [C++] Use unsafe appends when building array from CSV
    
    This makes reading a CSV file of binary string columns 5-10% faster.
    
    Author: Antoine Pitrou <[email protected]>
    
    Closes #2780 from pitrou/ARROW-3542-csv-unsafe-appends and squashes the 
following commits:
    
    4b47cb49f <Antoine Pitrou> ARROW-3542:  Use unsafe appends when building 
array from CSV
---
 cpp/src/arrow/array-test.cc    | 38 ++++++++++++++++++++++++++++++++++++++
 cpp/src/arrow/builder.h        | 35 +++++++++++++++++++++++++++++++++--
 cpp/src/arrow/csv/converter.cc |  6 ++++--
 cpp/src/arrow/csv/parser.h     |  3 ++-
 4 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index a0be049..4a50232 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1494,6 +1494,44 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
   }
 }
 
+TEST_F(TestBinaryBuilder, TestScalarAppendUnsafe) {
+  vector<string> strings = {"", "bb", "a", "", "ccc"};
+  vector<uint8_t> is_null = {0, 0, 0, 1, 0};
+
+  int N = static_cast<int>(strings.size());
+  int reps = 10;
+
+  ASSERT_OK(builder_->Reserve(5));
+  ASSERT_OK(builder_->ReserveData(6));
+
+  for (int j = 0; j < reps; ++j) {
+    for (int i = 0; i < N; ++i) {
+      if (is_null[i]) {
+        ASSERT_OK(builder_->AppendNull());
+      } else {
+        builder_->UnsafeAppend(strings[i]);
+      }
+    }
+  }
+  Done();
+  ASSERT_OK(ValidateArray(*result_));
+  ASSERT_EQ(reps * N, result_->length());
+  ASSERT_EQ(reps, result_->null_count());
+  ASSERT_EQ(reps * 6, result_->value_data()->size());
+
+  int32_t length;
+  for (int i = 0; i < N * reps; ++i) {
+    if (is_null[i % N]) {
+      ASSERT_TRUE(result_->IsNull(i));
+    } else {
+      ASSERT_FALSE(result_->IsNull(i));
+      const uint8_t* vals = result_->GetValue(i, &length);
+      ASSERT_EQ(static_cast<int>(strings[i % N].size()), length);
+      ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length));
+    }
+  }
+}
+
 TEST_F(TestBinaryBuilder, TestCapacityReserve) {
   vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc", 
"dddddddddd"};
   int N = static_cast<int>(strings.size());
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index cb4b9eb..185b506 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -643,6 +643,14 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
   /// Scalar append
   Status Append(const bool val) {
     ARROW_RETURN_NOT_OK(Reserve(1));
+    UnsafeAppend(val);
+    return Status::OK();
+  }
+
+  Status Append(const uint8_t val) { return Append(val != 0); }
+
+  /// Scalar append, without checking for capacity
+  void UnsafeAppend(const bool val) {
     BitUtil::SetBit(null_bitmap_data_, length_);
     if (val) {
       BitUtil::SetBit(raw_data_, length_);
@@ -650,10 +658,9 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
       BitUtil::ClearBit(raw_data_, length_);
     }
     ++length_;
-    return Status::OK();
   }
 
-  Status Append(const uint8_t val) { return Append(val != 0); }
+  void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
 
   /// \brief Append a sequence of elements in one shot
   /// \param[in] values a contiguous array of bytes (non-zero is 1)
@@ -850,6 +857,24 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
 
   Status AppendNull();
 
+  /// \brief Append without checking capacity
+  ///
+  /// Offsets and data should have been presized using Reserve() and
+  /// ReserveData(), respectively.
+  void UnsafeAppend(const uint8_t* value, int32_t length) {
+    UnsafeAppendNextOffset();
+    value_data_builder_.UnsafeAppend(value, length);
+    UnsafeAppendToBitmap(true);
+  }
+
+  void UnsafeAppend(const char* value, int32_t length) {
+    UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+  }
+
+  void UnsafeAppend(const std::string& value) {
+    UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size()));
+  }
+
   void Reset() override;
   Status Resize(int64_t capacity) override;
 
@@ -874,6 +899,11 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
   TypedBufferBuilder<uint8_t> value_data_builder_;
 
   Status AppendNextOffset();
+
+  void UnsafeAppendNextOffset() {
+    const int64_t num_bytes = value_data_builder_.length();
+    offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+  }
 };
 
 /// \class StringBuilder
@@ -885,6 +915,7 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {
 
   using BinaryBuilder::Append;
   using BinaryBuilder::Reset;
+  using BinaryBuilder::UnsafeAppend;
 
   /// \brief Append a sequence of strings in one shot.
   ///
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 19cf1eb..86f3ab0 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -208,7 +208,8 @@ Status VarSizeBinaryConverter<T>::Convert(const 
BlockParser& parser, int32_t col
   // TODO handle nulls
 
   auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
-    return builder.Append(data, size);
+    builder.UnsafeAppend(data, size);
+    return Status::OK();
   };
   RETURN_NOT_OK(builder.Resize(parser.num_rows()));
   RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
@@ -282,7 +283,8 @@ Status NumericConverter<T>::Convert(const BlockParser& 
parser, int32_t col_index
             !converter(reinterpret_cast<const char*>(data), size, &value))) {
       return GenericConversionError(type_, data, size);
     }
-    return builder.Append(value);
+    builder.UnsafeAppend(value);
+    return Status::OK();
   };
   RETURN_NOT_OK(builder.Resize(parser.num_rows()));
   RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h
index 46991b4..4529fce 100644
--- a/cpp/src/arrow/csv/parser.h
+++ b/cpp/src/arrow/csv/parser.h
@@ -73,7 +73,8 @@ class ARROW_EXPORT BlockParser {
 
   /// \brief Visit parsed values in a column
   ///
-  /// The signature of the visitor is (const uint8_t* data, uint32_t size, 
bool quoted)
+  /// The signature of the visitor is
+  /// Status(const uint8_t* data, uint32_t size, bool quoted)
   template <typename Visitor>
   Status VisitColumn(int32_t col_index, Visitor&& visit) const {
 #ifdef CSV_PARSER_USE_BITFIELD

Reply via email to