This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 390f7d0 ARROW-3542: [C++] Use unsafe appends when building array from
CSV
390f7d0 is described below
commit 390f7d0e235cc09914ebde465c8ca2fd03d2ef67
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Oct 17 13:29:39 2018 -0400
ARROW-3542: [C++] Use unsafe appends when building array from CSV
This makes reading a CSV file of binary string columns 5-10% faster.
Author: Antoine Pitrou <[email protected]>
Closes #2780 from pitrou/ARROW-3542-csv-unsafe-appends and squashes the
following commits:
4b47cb49f <Antoine Pitrou> ARROW-3542: Use unsafe appends when building
array from CSV
---
cpp/src/arrow/array-test.cc | 38 ++++++++++++++++++++++++++++++++++++++
cpp/src/arrow/builder.h | 35 +++++++++++++++++++++++++++++++++--
cpp/src/arrow/csv/converter.cc | 6 ++++--
cpp/src/arrow/csv/parser.h | 3 ++-
4 files changed, 77 insertions(+), 5 deletions(-)
diff --git a/cpp/src/arrow/array-test.cc b/cpp/src/arrow/array-test.cc
index a0be049..4a50232 100644
--- a/cpp/src/arrow/array-test.cc
+++ b/cpp/src/arrow/array-test.cc
@@ -1494,6 +1494,44 @@ TEST_F(TestBinaryBuilder, TestScalarAppend) {
}
}
+TEST_F(TestBinaryBuilder, TestScalarAppendUnsafe) {
+ vector<string> strings = {"", "bb", "a", "", "ccc"};
+ vector<uint8_t> is_null = {0, 0, 0, 1, 0};
+
+ int N = static_cast<int>(strings.size());
+ int reps = 10;
+
+ ASSERT_OK(builder_->Reserve(5));
+ ASSERT_OK(builder_->ReserveData(6));
+
+ for (int j = 0; j < reps; ++j) {
+ for (int i = 0; i < N; ++i) {
+ if (is_null[i]) {
+ ASSERT_OK(builder_->AppendNull());
+ } else {
+ builder_->UnsafeAppend(strings[i]);
+ }
+ }
+ }
+ Done();
+ ASSERT_OK(ValidateArray(*result_));
+ ASSERT_EQ(reps * N, result_->length());
+ ASSERT_EQ(reps, result_->null_count());
+ ASSERT_EQ(reps * 6, result_->value_data()->size());
+
+ int32_t length;
+ for (int i = 0; i < N * reps; ++i) {
+ if (is_null[i % N]) {
+ ASSERT_TRUE(result_->IsNull(i));
+ } else {
+ ASSERT_FALSE(result_->IsNull(i));
+ const uint8_t* vals = result_->GetValue(i, &length);
+ ASSERT_EQ(static_cast<int>(strings[i % N].size()), length);
+ ASSERT_EQ(0, std::memcmp(vals, strings[i % N].data(), length));
+ }
+ }
+}
+
TEST_F(TestBinaryBuilder, TestCapacityReserve) {
vector<string> strings = {"aaaaa", "bbbbbbbbbb", "ccccccccccccccc",
"dddddddddd"};
int N = static_cast<int>(strings.size());
diff --git a/cpp/src/arrow/builder.h b/cpp/src/arrow/builder.h
index cb4b9eb..185b506 100644
--- a/cpp/src/arrow/builder.h
+++ b/cpp/src/arrow/builder.h
@@ -643,6 +643,14 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
/// Scalar append
Status Append(const bool val) {
ARROW_RETURN_NOT_OK(Reserve(1));
+ UnsafeAppend(val);
+ return Status::OK();
+ }
+
+ Status Append(const uint8_t val) { return Append(val != 0); }
+
+ /// Scalar append, without checking for capacity
+ void UnsafeAppend(const bool val) {
BitUtil::SetBit(null_bitmap_data_, length_);
if (val) {
BitUtil::SetBit(raw_data_, length_);
@@ -650,10 +658,9 @@ class ARROW_EXPORT BooleanBuilder : public ArrayBuilder {
BitUtil::ClearBit(raw_data_, length_);
}
++length_;
- return Status::OK();
}
- Status Append(const uint8_t val) { return Append(val != 0); }
+ void UnsafeAppend(const uint8_t val) { UnsafeAppend(val != 0); }
/// \brief Append a sequence of elements in one shot
/// \param[in] values a contiguous array of bytes (non-zero is 1)
@@ -850,6 +857,24 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
Status AppendNull();
+ /// \brief Append without checking capacity
+ ///
+ /// Offsets and data should have been presized using Reserve() and
+ /// ReserveData(), respectively.
+ void UnsafeAppend(const uint8_t* value, int32_t length) {
+ UnsafeAppendNextOffset();
+ value_data_builder_.UnsafeAppend(value, length);
+ UnsafeAppendToBitmap(true);
+ }
+
+ void UnsafeAppend(const char* value, int32_t length) {
+ UnsafeAppend(reinterpret_cast<const uint8_t*>(value), length);
+ }
+
+ void UnsafeAppend(const std::string& value) {
+ UnsafeAppend(value.c_str(), static_cast<int32_t>(value.size()));
+ }
+
void Reset() override;
Status Resize(int64_t capacity) override;
@@ -874,6 +899,11 @@ class ARROW_EXPORT BinaryBuilder : public ArrayBuilder {
TypedBufferBuilder<uint8_t> value_data_builder_;
Status AppendNextOffset();
+
+ void UnsafeAppendNextOffset() {
+ const int64_t num_bytes = value_data_builder_.length();
+ offsets_builder_.UnsafeAppend(static_cast<int32_t>(num_bytes));
+ }
};
/// \class StringBuilder
@@ -885,6 +915,7 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder {
using BinaryBuilder::Append;
using BinaryBuilder::Reset;
+ using BinaryBuilder::UnsafeAppend;
/// \brief Append a sequence of strings in one shot.
///
diff --git a/cpp/src/arrow/csv/converter.cc b/cpp/src/arrow/csv/converter.cc
index 19cf1eb..86f3ab0 100644
--- a/cpp/src/arrow/csv/converter.cc
+++ b/cpp/src/arrow/csv/converter.cc
@@ -208,7 +208,8 @@ Status VarSizeBinaryConverter<T>::Convert(const
BlockParser& parser, int32_t col
// TODO handle nulls
auto visit = [&](const uint8_t* data, uint32_t size, bool quoted) -> Status {
- return builder.Append(data, size);
+ builder.UnsafeAppend(data, size);
+ return Status::OK();
};
RETURN_NOT_OK(builder.Resize(parser.num_rows()));
RETURN_NOT_OK(builder.ReserveData(parser.num_bytes()));
@@ -282,7 +283,8 @@ Status NumericConverter<T>::Convert(const BlockParser&
parser, int32_t col_index
!converter(reinterpret_cast<const char*>(data), size, &value))) {
return GenericConversionError(type_, data, size);
}
- return builder.Append(value);
+ builder.UnsafeAppend(value);
+ return Status::OK();
};
RETURN_NOT_OK(builder.Resize(parser.num_rows()));
RETURN_NOT_OK(parser.VisitColumn(col_index, visit));
diff --git a/cpp/src/arrow/csv/parser.h b/cpp/src/arrow/csv/parser.h
index 46991b4..4529fce 100644
--- a/cpp/src/arrow/csv/parser.h
+++ b/cpp/src/arrow/csv/parser.h
@@ -73,7 +73,8 @@ class ARROW_EXPORT BlockParser {
/// \brief Visit parsed values in a column
///
- /// The signature of the visitor is (const uint8_t* data, uint32_t size,
bool quoted)
+ /// The signature of the visitor is
+ /// Status(const uint8_t* data, uint32_t size, bool quoted)
template <typename Visitor>
Status VisitColumn(int32_t col_index, Visitor&& visit) const {
#ifdef CSV_PARSER_USE_BITFIELD