This is an automated email from the ASF dual-hosted git repository. uwe pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push: new 5f0ff7f ARROW-3239: [C++] Implement simple random array generation 5f0ff7f is described below commit 5f0ff7fcce18cc2afed8cf1165696946adb517a4 Author: François Saint-Jacques <fsaintjacq...@gmail.com> AuthorDate: Tue Feb 5 14:37:08 2019 +0100 ARROW-3239: [C++] Implement simple random array generation This implements the following API. ``` random::RandomArrayGenerator rand(seed); auto bool_array = rand.Boolean(num_rows, 0.75, null_prob); auto i8_array = rand.Int8(num_rows, 0, 100, null_prob); ``` Author: François Saint-Jacques <fsaintjacq...@gmail.com> Closes #3533 from fsaintjacques/ARROW-3239-random-utils and squashes the following commits: a806b1ff <François Saint-Jacques> Add ARROW_EXPORT to RandomArrayGenerator 63d9103b <François Saint-Jacques> Fix GenerateOptions seed type 59c3a3bb <François Saint-Jacques> Add undef to macro 22eca801 <François Saint-Jacques> Handle special case with MSVC 728aadcd <François Saint-Jacques> Fix downcasting issues 4840ac0e <François Saint-Jacques> ARROW-3239: Implement simple random array generation --- cpp/src/arrow/CMakeLists.txt | 2 +- cpp/src/arrow/ipc/json-test.cc | 43 +++----- cpp/src/arrow/ipc/read-write-benchmark.cc | 33 ++---- cpp/src/arrow/ipc/test-common.h | 21 ++-- cpp/src/arrow/test-random.cc | 149 ++++++++++++++++++++++++++ cpp/src/arrow/test-random.h | 169 ++++++++++++++++++++++++++++++ cpp/src/arrow/test-util.h | 5 +- 7 files changed, 354 insertions(+), 68 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index 1dba589..c65824f 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -243,7 +243,7 @@ endif() if (ARROW_BUILD_TESTS OR ARROW_BUILD_BENCHMARKS) # that depend on gtest ADD_ARROW_LIB(arrow_testing - SOURCES test-util.cc + SOURCES test-util.cc test-random.cc OUTPUTS ARROW_TESTING_LIBRARIES DEPENDENCIES ${GTEST_LIBRARY} SHARED_LINK_LIBS arrow_shared ${GTEST_LIBRARY} diff --git 
a/cpp/src/arrow/ipc/json-test.cc b/cpp/src/arrow/ipc/json-test.cc index 47a0a29..bea6fbb 100644 --- a/cpp/src/arrow/ipc/json-test.cc +++ b/cpp/src/arrow/ipc/json-test.cc @@ -32,6 +32,7 @@ #include "arrow/ipc/test-common.h" #include "arrow/memory_pool.h" #include "arrow/record_batch.h" +#include "arrow/test-random.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -216,48 +217,38 @@ TEST(TestJsonArrayWriter, Unions) { // Data generation for test case below void MakeBatchArrays(const std::shared_ptr<Schema>& schema, const int num_rows, std::vector<std::shared_ptr<Array>>* arrays) { - std::vector<bool> is_valid; - random_is_valid(num_rows, 0.25, &is_valid); + const float null_prob = 0.25f; + random::RandomArrayGenerator rand(0x564a3bf0); - std::vector<int8_t> v1_values; - std::vector<int32_t> v2_values; - - randint(num_rows, 0, 100, &v1_values); - randint(num_rows, 0, 100, &v2_values); - - std::shared_ptr<Array> v1; - ArrayFromVector<Int8Type, int8_t>(is_valid, v1_values, &v1); - - std::shared_ptr<Array> v2; - ArrayFromVector<Int32Type, int32_t>(is_valid, v2_values, &v2); + *arrays = {rand.Boolean(num_rows, 0.75, null_prob), + rand.Int8(num_rows, 0, 100, null_prob), + rand.Int32(num_rows, -1000, 1000, null_prob), + rand.UInt64(num_rows, 0, 1UL << 16, null_prob)}; static const int kBufferSize = 10; static uint8_t buffer[kBufferSize]; static uint32_t seed = 0; StringBuilder string_builder; for (int i = 0; i < num_rows; ++i) { - if (!is_valid[i]) { - ASSERT_OK(string_builder.AppendNull()); - } else { - random_ascii(kBufferSize, seed++, buffer); - ASSERT_OK(string_builder.Append(buffer, kBufferSize)); - } + random_ascii(kBufferSize, seed++, buffer); + ASSERT_OK(string_builder.Append(buffer, kBufferSize)); } std::shared_ptr<Array> v3; ASSERT_OK(string_builder.Finish(&v3)); - arrays->emplace_back(v1); - arrays->emplace_back(v2); arrays->emplace_back(v3); } TEST(TestJsonFileReadWrite, BasicRoundTrip) { - auto v1_type = int8(); - auto 
v2_type = int32(); - auto v3_type = utf8(); + auto v1_type = boolean(); + auto v2_type = int8(); + auto v3_type = int32(); + auto v4_type = uint64(); + auto v5_type = utf8(); auto schema = - ::arrow::schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)}); + ::arrow::schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type), + field("f4", v4_type), field("f5", v5_type)}); std::unique_ptr<JsonWriter> writer; ASSERT_OK(JsonWriter::Open(schema, &writer)); @@ -289,7 +280,7 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { for (int i = 0; i < nbatches; ++i) { std::shared_ptr<RecordBatch> batch; ASSERT_OK(reader->ReadRecordBatch(i, &batch)); - ASSERT_TRUE(batch->Equals(*batches[i])); + ASSERT_RECORD_BATCHES_EQUAL(*batch, *batches[i]); } } diff --git a/cpp/src/arrow/ipc/read-write-benchmark.cc b/cpp/src/arrow/ipc/read-write-benchmark.cc index ace2965..359cd0e 100644 --- a/cpp/src/arrow/ipc/read-write-benchmark.cc +++ b/cpp/src/arrow/ipc/read-write-benchmark.cc @@ -24,34 +24,15 @@ #include "arrow/api.h" #include "arrow/io/memory.h" #include "arrow/ipc/api.h" +#include "arrow/test-random.h" #include "arrow/test-util.h" namespace arrow { -template <typename TYPE> std::shared_ptr<RecordBatch> MakeRecordBatch(int64_t total_size, int64_t num_fields) { - using T = typename TYPE::c_type; - size_t itemsize = sizeof(T); - int64_t length = total_size / num_fields / itemsize; - - auto type = TypeTraits<TYPE>::type_singleton(); - - std::vector<bool> is_valid; - random_is_valid(length, 0.1, &is_valid); - - std::vector<T> values; - randint<T>(length, 0, 100, &values); - - typename TypeTraits<TYPE>::BuilderType builder(type, default_memory_pool()); - for (size_t i = 0; i < values.size(); ++i) { - if (is_valid[i]) { - ABORT_NOT_OK(builder.Append(values[i])); - } else { - ABORT_NOT_OK(builder.AppendNull()); - } - } - std::shared_ptr<Array> array; - ABORT_NOT_OK(builder.Finish(&array)); + int64_t length = total_size / num_fields / sizeof(int64_t); + 
random::RandomArrayGenerator rand(0x4f32a908); + auto type = arrow::int64(); ArrayVector arrays; std::vector<std::shared_ptr<Field>> fields; @@ -59,7 +40,7 @@ std::shared_ptr<RecordBatch> MakeRecordBatch(int64_t total_size, int64_t num_fie std::stringstream ss; ss << "f" << i; fields.push_back(field(ss.str(), type)); - arrays.push_back(array); + arrays.push_back(rand.Int64(length, 0, 100, 0.1)); } auto schema = std::make_shared<Schema>(fields); @@ -72,7 +53,7 @@ static void BM_WriteRecordBatch(benchmark::State& state) { // NOLINT non-const std::shared_ptr<ResizableBuffer> buffer; ABORT_NOT_OK(AllocateResizableBuffer(kTotalSize & 2, &buffer)); - auto record_batch = MakeRecordBatch<Int64Type>(kTotalSize, state.range(0)); + auto record_batch = MakeRecordBatch(kTotalSize, state.range(0)); while (state.KeepRunning()) { io::BufferOutputStream stream(buffer); @@ -93,7 +74,7 @@ static void BM_ReadRecordBatch(benchmark::State& state) { // NOLINT non-const r std::shared_ptr<ResizableBuffer> buffer; ABORT_NOT_OK(AllocateResizableBuffer(kTotalSize & 2, &buffer)); - auto record_batch = MakeRecordBatch<Int64Type>(kTotalSize, state.range(0)); + auto record_batch = MakeRecordBatch(kTotalSize, state.range(0)); io::BufferOutputStream stream(buffer); diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 4f7de26..c9f577d 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -32,6 +32,7 @@ #include "arrow/pretty_print.h" #include "arrow/record_batch.h" #include "arrow/status.h" +#include "arrow/test-random.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/bit-util.h" @@ -67,20 +68,12 @@ const auto kListListInt32 = list(kListInt32); Status MakeRandomInt32Array(int64_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr<Array>* out, uint32_t seed = 0) { - std::shared_ptr<ResizableBuffer> data; - RETURN_NOT_OK(MakeRandomBuffer<int32_t>(length, pool, &data, seed)); - Int32Builder 
builder(int32(), pool); - RETURN_NOT_OK(builder.Resize(length)); - if (include_nulls) { - std::shared_ptr<ResizableBuffer> valid_bytes; - RETURN_NOT_OK(MakeRandomByteBuffer(length, pool, &valid_bytes)); - RETURN_NOT_OK(builder.AppendValues(reinterpret_cast<const int32_t*>(data->data()), - length, valid_bytes->data())); - return builder.Finish(out); - } - RETURN_NOT_OK( - builder.AppendValues(reinterpret_cast<const int32_t*>(data->data()), length)); - return builder.Finish(out); + random::RandomArrayGenerator rand(seed); + const double null_probability = include_nulls ? 0.5 : 0.0; + + *out = rand.Int32(length, 0, 1000, null_probability); + + return Status::OK(); } Status MakeRandomListArray(const std::shared_ptr<Array>& child_array, int num_lists, diff --git a/cpp/src/arrow/test-random.cc b/cpp/src/arrow/test-random.cc new file mode 100644 index 0000000..cb35bfd --- /dev/null +++ b/cpp/src/arrow/test-random.cc @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/test-random.h" + +#include <algorithm> +#include <memory> +#include <random> +#include <vector> + +#include <gtest/gtest.h> + +#include "arrow/array.h" +#include "arrow/buffer.h" +#include "arrow/test-util.h" +#include "arrow/type.h" +#include "arrow/type_fwd.h" +#include "arrow/type_traits.h" +#include "arrow/util/bit-util.h" + +namespace arrow { +namespace random { + +template <typename ValueType, typename DistributionType> +struct GenerateOptions { + GenerateOptions(SeedType seed, ValueType min, ValueType max, double probability) + : min_(min), max_(max), seed_(seed), probability_(probability) {} + + void GenerateData(uint8_t* buffer, size_t n) { + std::default_random_engine rng(seed_++); + DistributionType dist(min_, max_); + + ValueType* data = reinterpret_cast<ValueType*>(buffer); + + // A static cast is required due to the int16 -> int8 handling. + std::generate(data, data + n, + [&dist, &rng] { return static_cast<ValueType>(dist(rng)); }); + } + + void GenerateBitmap(uint8_t* buffer, size_t n, int64_t* null_count) { + int64_t count = 0; + std::default_random_engine rng(seed_++); + std::bernoulli_distribution dist(1.0 - probability_); + + for (size_t i = 0; i < n; i++) { + if (dist(rng)) { + BitUtil::SetBit(buffer, i); + } else { + count++; + } + } + + if (null_count != nullptr) *null_count = count; + } + + ValueType min_; + ValueType max_; + SeedType seed_; + double probability_; +}; + +std::shared_ptr<Array> RandomArrayGenerator::Boolean(int64_t size, double probability, + double null_probability) { + // The boolean generator does not care about the value distribution since it + // only calls the GenerateBitmap method. + using GenOpt = GenerateOptions<int, std::uniform_int_distribution<int>>; + + std::vector<std::shared_ptr<Buffer>> buffers{2}; + // Need 2 distinct generators such that probabilities are not shared. 
+ GenOpt value_gen(seed(), 0, 1, probability); + GenOpt null_gen(seed(), 0, 1, null_probability); + + int64_t null_count = 0; + ABORT_NOT_OK(AllocateEmptyBitmap(size, &buffers[0])); + null_gen.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count); + + ABORT_NOT_OK(AllocateEmptyBitmap(size, &buffers[1])); + value_gen.GenerateBitmap(buffers[1]->mutable_data(), size, nullptr); + + auto array_data = ArrayData::Make(arrow::boolean(), size, buffers, null_count); + return std::make_shared<BooleanArray>(array_data); +} + +template <typename ArrowType, typename OptionType> +static std::shared_ptr<NumericArray<ArrowType>> GenerateNumericArray(int64_t size, + OptionType options) { + using CType = typename ArrowType::c_type; + auto type = TypeTraits<ArrowType>::type_singleton(); + std::vector<std::shared_ptr<Buffer>> buffers{2}; + + int64_t null_count = 0; + ABORT_NOT_OK(AllocateEmptyBitmap(size, &buffers[0])); + options.GenerateBitmap(buffers[0]->mutable_data(), size, &null_count); + + ABORT_NOT_OK(AllocateBuffer(sizeof(CType) * size, &buffers[1])) + options.GenerateData(buffers[1]->mutable_data(), size); + + auto array_data = ArrayData::Make(type, size, buffers, null_count); + return std::make_shared<NumericArray<ArrowType>>(array_data); +} + +#define PRIMITIVE_RAND_IMPL(Name, CType, ArrowType, Distribution) \ + std::shared_ptr<Array> RandomArrayGenerator::Name(int64_t size, CType min, CType max, \ + double probability) { \ + using OptionType = GenerateOptions<CType, Distribution>; \ + OptionType options(seed(), min, max, probability); \ + return GenerateNumericArray<ArrowType, OptionType>(size, options); \ + } + +#define PRIMITIVE_RAND_INTEGER_IMPL(Name, CType, ArrowType) \ + PRIMITIVE_RAND_IMPL(Name, CType, ArrowType, std::uniform_int_distribution<CType>) + +// Visual Studio does not implement uniform_int_distribution for char types. 
+PRIMITIVE_RAND_IMPL(UInt8, uint8_t, UInt8Type, std::uniform_int_distribution<uint16_t>) +PRIMITIVE_RAND_IMPL(Int8, int8_t, Int8Type, std::uniform_int_distribution<int16_t>) + +PRIMITIVE_RAND_INTEGER_IMPL(UInt16, uint16_t, UInt16Type) +PRIMITIVE_RAND_INTEGER_IMPL(Int16, int16_t, Int16Type) +PRIMITIVE_RAND_INTEGER_IMPL(UInt32, uint32_t, UInt32Type) +PRIMITIVE_RAND_INTEGER_IMPL(Int32, int32_t, Int32Type) +PRIMITIVE_RAND_INTEGER_IMPL(UInt64, uint64_t, UInt64Type) +PRIMITIVE_RAND_INTEGER_IMPL(Int64, int64_t, Int64Type) + +#define PRIMITIVE_RAND_FLOAT_IMPL(Name, CType, ArrowType) \ + PRIMITIVE_RAND_IMPL(Name, CType, ArrowType, std::uniform_real_distribution<CType>) + +PRIMITIVE_RAND_FLOAT_IMPL(Float32, float, FloatType) +PRIMITIVE_RAND_FLOAT_IMPL(Float64, double, DoubleType) + +#undef PRIMITIVE_RAND_INTEGER_IMPL +#undef PRIMITIVE_RAND_FLOAT_IMPL +#undef PRIMITIVE_RAND_IMPL + +} // namespace random +} // namespace arrow diff --git a/cpp/src/arrow/test-random.h b/cpp/src/arrow/test-random.h new file mode 100644 index 0000000..dc57dca --- /dev/null +++ b/cpp/src/arrow/test-random.h @@ -0,0 +1,169 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include <cstdint> +#include <limits> +#include <memory> +#include <random> + +#include "arrow/util/visibility.h" + +namespace arrow { + +class Array; + +namespace random { + +using SeedType = std::random_device::result_type; +constexpr SeedType kSeedMax = std::numeric_limits<SeedType>::max(); + +class ARROW_EXPORT RandomArrayGenerator { + public: + explicit RandomArrayGenerator(SeedType seed) + : seed_distribution_(static_cast<SeedType>(1), kSeedMax), seed_rng_(seed) {} + + /// \brief Generates a random BooleanArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] probability the estimated number of active bits + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Boolean(int64_t size, double probability, + double null_probability); + + /// \brief Generates a random UInt8Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> UInt8(int64_t size, uint8_t min, uint8_t max, + double null_probability); + + /// \brief Generates a random Int8Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Int8(int64_t size, int8_t min, int8_t max, + double null_probability); + + /// \brief Generates a random UInt16Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the 
uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> UInt16(int64_t size, uint16_t min, uint16_t max, + double null_probability); + + /// \brief Generates a random Int16Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Int16(int64_t size, int16_t min, int16_t max, + double null_probability); + + /// \brief Generates a random UInt32Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> UInt32(int64_t size, uint32_t min, uint32_t max, + double null_probability); + + /// \brief Generates a random Int32Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Int32(int64_t size, int32_t min, int32_t max, + double null_probability); + + /// \brief Generates a random UInt64Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> UInt64(int64_t size, 
uint64_t min, uint64_t max, + double null_probability); + + /// \brief Generates a random Int64Array + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Int64(int64_t size, int64_t min, int64_t max, + double null_probability); + + /// \brief Generates a random FloatArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Float32(int64_t size, float min, float max, + double null_probability); + + /// \brief Generates a random DoubleArray + /// + /// \param[in] size the size of the array to generate + /// \param[in] min the lower bound of the uniform distribution + /// \param[in] max the upper bound of the uniform distribution + /// \param[in] null_probability the probability of a row being null + /// + /// \return a generated Array + std::shared_ptr<arrow::Array> Float64(int64_t size, double min, double max, + double null_probability); + + private: + SeedType seed() { return seed_distribution_(seed_rng_); } + + std::uniform_int_distribution<SeedType> seed_distribution_; + std::default_random_engine seed_rng_; +}; + +} // namespace random +} // namespace arrow diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 713ff38..546cc4e 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -100,7 +100,7 @@ class Table; using ArrayVector = std::vector<std::shared_ptr<Array>>; -#define ASSERT_ARRAYS_EQUAL(LEFT, RIGHT) \ +#define ASSERT_PP_EQUAL(LEFT, RIGHT) \ do { \ if 
(!(LEFT).Equals((RIGHT))) { \ std::stringstream pp_result; \ @@ -112,6 +112,9 @@ using ArrayVector = std::vector<std::shared_ptr<Array>>; } \ } while (false) +#define ASSERT_ARRAYS_EQUAL(lhs, rhs) ASSERT_PP_EQUAL(lhs, rhs) +#define ASSERT_RECORD_BATCHES_EQUAL(lhs, rhs) ASSERT_PP_EQUAL(lhs, rhs) + template <typename T, typename U> void randint(int64_t N, T lower, T upper, std::vector<U>* out) { const int random_seed = 0;