Repository: parquet-cpp Updated Branches: refs/heads/master 3dedeed1d -> 18b504482
PARQUET-1054: Fixes for Arrow API changes in ARROW-1199 This will be broken until ARROW-1199 is merged Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #370 from wesm/arrow-api-changes and squashes the following commits: 6d10f87 [Wes McKinney] Fix decode_benchmark 18cbac7 [Wes McKinney] Update Arrow version 0f11953 [Wes McKinney] Restore prior parameters in parquet 1.0 compatibility test 1c35d49 [Wes McKinney] Fix toolchain lib paths 3914214 [Wes McKinney] Update Arrow EP to exclude lz4/zstd d033f25 [Wes McKinney] Active conda toolchain environment c1477a4 [Wes McKinney] Update Arrow version to include ARROW-1199 1d004fa [Wes McKinney] Fixes for Arrow API changes in ARROW-1199 Project: http://git-wip-us.apache.org/repos/asf/parquet-cpp/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-cpp/commit/18b50448 Tree: http://git-wip-us.apache.org/repos/asf/parquet-cpp/tree/18b50448 Diff: http://git-wip-us.apache.org/repos/asf/parquet-cpp/diff/18b50448 Branch: refs/heads/master Commit: 18b504482be09ee4da37f5f825d68c4b8b00fe70 Parents: 3dedeed Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Tue Jul 11 16:20:41 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue Jul 11 16:20:41 2017 -0400 ---------------------------------------------------------------------- benchmarks/decode_benchmark.cc | 1 + ci/travis_script_static.sh | 17 +++++--- cmake_modules/ThirdpartyToolchain.cmake | 4 +- src/parquet/arrow/arrow-reader-writer-test.cc | 13 ++++--- src/parquet/arrow/reader.cc | 4 +- src/parquet/arrow/test-util.h | 5 ++- src/parquet/arrow/writer.cc | 45 ++++++++-------------- src/parquet/file/writer.cc | 1 - 8 files changed, 43 insertions(+), 47 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/benchmarks/decode_benchmark.cc ---------------------------------------------------------------------- diff --git a/benchmarks/decode_benchmark.cc b/benchmarks/decode_benchmark.cc index 5514d8b..57279d0 100644 --- a/benchmarks/decode_benchmark.cc +++ b/benchmarks/decode_benchmark.cc @@ -20,6 +20,7 @@ #include <stdio.h> #include "arrow/util/compression.h" +#include "arrow/util/compression_snappy.h" #include "parquet/encoding-internal.h" #include "parquet/util/logging.h" http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/ci/travis_script_static.sh ---------------------------------------------------------------------- diff --git a/ci/travis_script_static.sh b/ci/travis_script_static.sh index de57acc..4af2653 100755 --- a/ci/travis_script_static.sh +++ b/ci/travis_script_static.sh @@ -43,6 +43,8 @@ conda create -y -q -p $CPP_TOOLCHAIN \ boost-cpp thrift-cpp cmake git \ -c conda-forge +source activate $CPP_TOOLCHAIN + # ---------------------------------------------------------------------- : ${CPP_BUILD_DIR=$TRAVIS_BUILD_DIR/parquet-build} @@ -51,12 +53,15 @@ export PARQUET_BUILD_TOOLCHAIN=$CPP_TOOLCHAIN export LD_LIBRARY_PATH=$CPP_TOOLCHAIN/lib:$LD_LIBRARY_PATH export BOOST_ROOT=$CPP_TOOLCHAIN export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data -export SNAPPY_STATIC_LIB=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/snappy_ep/src/snappy_ep-install/lib/libsnappy.a -export BROTLI_STATIC_LIB_ENC=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlienc.a -export BROTLI_STATIC_LIB_DEC=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlidec.a -export BROTLI_STATIC_LIB_COMMON=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu/libbrotlicommon.a -export ZLIB_STATIC_LIB=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build/zlib_ep/src/zlib_ep-install/lib/libz.a +ARROW_EP=$TRAVIS_BUILD_DIR/parquet-build/arrow_ep-prefix/src/arrow_ep-build +BROTLI_EP=$ARROW_EP/brotli_ep/src/brotli_ep-install/lib/x86_64-linux-gnu + +export SNAPPY_STATIC_LIB=$ARROW_EP/snappy_ep/src/snappy_ep-install/lib/libsnappy.a +export BROTLI_STATIC_LIB_ENC=$BROTLI_EP/libbrotlienc.a +export BROTLI_STATIC_LIB_DEC=$BROTLI_EP/libbrotlidec.a +export BROTLI_STATIC_LIB_COMMON=$BROTLI_EP/libbrotlicommon.a +export ZLIB_STATIC_LIB=$ARROW_EP/zlib_ep/src/zlib_ep-install/lib/libz.a cmake -DPARQUET_CXXFLAGS=-Werror \ -DPARQUET_TEST_MEMCHECK=ON \ @@ -70,7 +75,7 @@ cmake -DPARQUET_CXXFLAGS=-Werror \ pushd $CPP_BUILD_DIR -make -j4 || exit 1 +make -j4 VERBOSE=1 || exit 1 ctest -VV -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; } popd http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/cmake_modules/ThirdpartyToolchain.cmake ---------------------------------------------------------------------- diff --git a/cmake_modules/ThirdpartyToolchain.cmake b/cmake_modules/ThirdpartyToolchain.cmake index 85630a4..2717fb5 100644 --- a/cmake_modules/ThirdpartyToolchain.cmake +++ b/cmake_modules/ThirdpartyToolchain.cmake @@ -332,12 +332,14 @@ if (NOT ARROW_FOUND) -DCMAKE_INSTALL_LIBDIR=${ARROW_LIB_DIR} -DARROW_JEMALLOC=OFF -DARROW_IPC=OFF + -DARROW_WITH_LZ4=OFF + -DARROW_WITH_ZSTD=OFF -DARROW_BUILD_SHARED=${PARQUET_BUILD_SHARED} -DARROW_BOOST_USE_SHARED=${PARQUET_BOOST_USE_SHARED} -DARROW_BUILD_TESTS=OFF) if ("$ENV{PARQUET_ARROW_VERSION}" STREQUAL "") - set(ARROW_VERSION "a58893882ac8acd1ac4a5036685cbf09a9a09673") + set(ARROW_VERSION "afb192824a75ab81fbc8dcd2da56409186bb23e0") else() set(ARROW_VERSION "$ENV{PARQUET_ARROW_VERSION}") endif() http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/arrow-reader-writer-test.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/arrow-reader-writer-test.cc b/src/parquet/arrow/arrow-reader-writer-test.cc index ddc197c..746ce14 100644 --- a/src/parquet/arrow/arrow-reader-writer-test.cc +++ b/src/parquet/arrow/arrow-reader-writer-test.cc @@ -421,7 +421,9 @@ class TestParquetIO : public ::testing::Test { std::shared_ptr<ChunkedArray> chunked_array = out->column(0)->data(); ASSERT_EQ(1, chunked_array->num_chunks()); - ASSERT_TRUE(values->Equals(chunked_array->chunk(0))); + auto result = chunked_array->chunk(0); + + ASSERT_TRUE(values->Equals(result)); } void CheckRoundTrip(const std::shared_ptr<Table>& table) { @@ -762,7 +764,7 @@ TEST_F(TestUInt32ParquetIO, Parquet_1_0_Compability) { ASSERT_OK(int64_data->Resize(sizeof(int64_t) * values->length())); int64_t* int64_data_ptr = reinterpret_cast<int64_t*>(int64_data->mutable_data()); const uint32_t* uint32_data_ptr = - reinterpret_cast<const uint32_t*>(values->data()->data()); + reinterpret_cast<const uint32_t*>(values->values()->data()); // std::copy might be faster but this is explicit on the casts) for (int64_t i = 0; i < values->length(); i++) { int64_data_ptr[i] = static_cast<int64_t>(uint32_data_ptr[i]); @@ -1219,7 +1221,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> { // Produce values for the columns MakeValues(NUM_SIMPLE_TEST_ROWS); - int32_t* values = reinterpret_cast<int32_t*>(values_array_->data()->mutable_data()); + int32_t* values = reinterpret_cast<int32_t*>(values_array_->values()->mutable_data()); // Create the actual parquet file InitNewParquetFile( @@ -1283,7 +1285,7 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> { // Produce values for the columns MakeValues(num_rows); - int32_t* values = reinterpret_cast<int32_t*>(values_array_->data()->mutable_data()); + int32_t* values = reinterpret_cast<int32_t*>(values_array_->values()->mutable_data()); // Create the actual parquet file InitNewParquetFile(std::static_pointer_cast<GroupNode>(schema_node), num_rows); @@ -1323,7 +1325,8 @@ class TestNestedSchemaRead : public ::testing::TestWithParam<Repetition::type> { } virtual Status Visit(const ::arrow::StructArray& array) { - for (auto& child : array.fields()) { + for (int32_t i = 0; i < array.num_fields(); ++i) { + auto child = array.field(i); if (node_repetition_ == Repetition::REQUIRED) { RETURN_NOT_OK(child->Accept(this)); } else if (node_repetition_ == Repetition::OPTIONAL) { http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/reader.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/reader.cc b/src/parquet/arrow/reader.cc index 9d7ceed..e941c1f 100644 --- a/src/parquet/arrow/reader.cc +++ b/src/parquet/arrow/reader.cc @@ -974,9 +974,9 @@ Status PrimitiveImpl::WrapIntoListArray(const int16_t* def_levels, list_lengths.push_back(offset_builders[j]->length() - 1); std::shared_ptr<Array> array; RETURN_NOT_OK(offset_builders[j]->Finish(&array)); - offsets.emplace_back(std::static_pointer_cast<Int32Array>(array)->data()); + offsets.emplace_back(std::static_pointer_cast<Int32Array>(array)->values()); RETURN_NOT_OK(valid_bits_builders[j]->Finish(&array)); - valid_bits.emplace_back(std::static_pointer_cast<BooleanArray>(array)->data()); + valid_bits.emplace_back(std::static_pointer_cast<BooleanArray>(array)->values()); } std::shared_ptr<Array> output(*array); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/test-util.h ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/test-util.h b/src/parquet/arrow/test-util.h index 191a399..946afad 100644 --- a/src/parquet/arrow/test-util.h +++ b/src/parquet/arrow/test-util.h @@ -332,7 +332,7 @@ template <typename T> void ExpectArray(T* expected, Array* result) { auto p_array = static_cast<::arrow::PrimitiveArray*>(result); for (int i = 0; i < result->length(); i++) { - EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->data()->data())[i]); + EXPECT_EQ(expected[i], reinterpret_cast<const T*>(p_array->values()->data())[i]); } } @@ -341,7 +341,8 @@ void ExpectArrayT(void* expected, Array* result) { ::arrow::PrimitiveArray* p_array = static_cast<::arrow::PrimitiveArray*>(result); for (int64_t i = 0; i < result->length(); i++) { EXPECT_EQ(reinterpret_cast<typename ArrowType::c_type*>(expected)[i], - reinterpret_cast<const typename ArrowType::c_type*>(p_array->data()->data())[i]); + reinterpret_cast<const typename ArrowType::c_type*>( + p_array->values()->data())[i]); } } http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/arrow/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/arrow/writer.cc b/src/parquet/arrow/writer.cc index a7aebac..104c040 100644 --- a/src/parquet/arrow/writer.cc +++ b/src/parquet/arrow/writer.cc @@ -61,30 +61,14 @@ class LevelBuilder { Status VisitInline(const Array& array); - Status Visit(const ::arrow::NullArray& array) { - array_offsets_.push_back(static_cast<int32_t>(array.offset())); - valid_bitmaps_.push_back(array.null_bitmap_data()); - null_counts_.push_back(array.length()); - values_type_ = array.type_id(); - values_array_ = &array; - return Status::OK(); - } - - Status Visit(const ::arrow::PrimitiveArray& array) { - array_offsets_.push_back(static_cast<int32_t>(array.offset())); - valid_bitmaps_.push_back(array.null_bitmap_data()); - null_counts_.push_back(array.null_count()); - values_type_ = array.type_id(); - values_array_ = &array; - return Status::OK(); - } - - Status Visit(const ::arrow::BinaryArray& array) { + template <typename T> + typename std::enable_if<std::is_base_of<::arrow::FlatArray, T>::value, Status>::type + Visit(const T& array) { array_offsets_.push_back(static_cast<int32_t>(array.offset())); valid_bitmaps_.push_back(array.null_bitmap_data()); null_counts_.push_back(array.null_count()); values_type_ = array.type_id(); - values_array_ = &array; + values_array_ = std::make_shared<T>(array.data()); return Status::OK(); } @@ -115,7 +99,7 @@ class LevelBuilder { Status GenerateLevels(const Array& array, const std::shared_ptr<Field>& field, int64_t* values_offset, ::arrow::Type::type* values_type, int64_t* num_values, int64_t* num_levels, std::shared_ptr<Buffer>* def_levels, - std::shared_ptr<Buffer>* rep_levels, const Array** values_array) { + std::shared_ptr<Buffer>* rep_levels, std::shared_ptr<Array>* values_array) { // Work downwards to extract bitmaps and offsets min_offset_idx_ = 0; max_offset_idx_ = static_cast<int32_t>(array.length()); @@ -173,11 +157,11 @@ class LevelBuilder { std::shared_ptr<Array> def_levels_array; RETURN_NOT_OK(def_levels_.Finish(&def_levels_array)); - *def_levels = static_cast<PrimitiveArray*>(def_levels_array.get())->data(); + *def_levels = static_cast<PrimitiveArray*>(def_levels_array.get())->values(); std::shared_ptr<Array> rep_levels_array; RETURN_NOT_OK(rep_levels_.Finish(&rep_levels_array)); - *rep_levels = static_cast<PrimitiveArray*>(rep_levels_array.get())->data(); + *rep_levels = static_cast<PrimitiveArray*>(rep_levels_array.get())->values(); *num_levels = rep_levels_array->length(); } @@ -248,7 +232,7 @@ class LevelBuilder { int32_t min_offset_idx_; int32_t max_offset_idx_; ::arrow::Type::type values_type_; - const Array* values_array_; + std::shared_ptr<Array> values_array_; }; Status LevelBuilder::VisitInline(const Array& array) { @@ -311,7 +295,7 @@ Status FileWriter::Impl::TypedWriteBatch(ColumnWriter* column_writer, using ArrowCType = typename ArrowType::c_type; auto data = static_cast<const PrimitiveArray*>(array.get()); - auto data_ptr = reinterpret_cast<const ArrowCType*>(data->data()->data()); + auto data_ptr = reinterpret_cast<const ArrowCType*>(data->values()->data()); auto writer = reinterpret_cast<TypedColumnWriter<ParquetType>*>(column_writer); if (writer->descr()->schema_node()->is_required() || (data->null_count() == 0)) { @@ -501,7 +485,7 @@ Status FileWriter::Impl::TypedWriteBatch<BooleanType, ::arrow::BooleanType>( const int16_t* def_levels, const int16_t* rep_levels) { RETURN_NOT_OK(data_buffer_.Resize(array->length())); auto data = static_cast<const BooleanArray*>(array.get()); - auto data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data()); + auto data_ptr = reinterpret_cast<const uint8_t*>(data->values()->data()); auto buffer_ptr = reinterpret_cast<bool*>(data_buffer_.mutable_data()); auto writer = reinterpret_cast<TypedColumnWriter<BooleanType>*>(column_writer); @@ -540,8 +524,8 @@ Status FileWriter::Impl::TypedWriteBatch<ByteArrayType, ::arrow::BinaryType>( // data->data() points already to a nullptr, thus data->data()->data() will // segfault. const uint8_t* data_ptr = nullptr; - if (data->data()) { - data_ptr = reinterpret_cast<const uint8_t*>(data->data()->data()); + if (data->value_data()) { + data_ptr = reinterpret_cast<const uint8_t*>(data->value_data()->data()); DCHECK(data_ptr != nullptr); } auto writer = reinterpret_cast<TypedColumnWriter<ByteArrayType>*>(column_writer); @@ -620,14 +604,15 @@ Status FileWriter::Impl::WriteColumnChunk(const Array& data) { std::shared_ptr<::arrow::Schema> arrow_schema; RETURN_NOT_OK(FromParquetSchema(writer_->schema(), {current_column_idx - 1}, writer_->key_value_metadata(), &arrow_schema)); - LevelBuilder level_builder(pool_); std::shared_ptr<Buffer> def_levels_buffer; std::shared_ptr<Buffer> rep_levels_buffer; int64_t values_offset; ::arrow::Type::type values_type; int64_t num_levels; int64_t num_values; - const Array* _values_array; + + std::shared_ptr<Array> _values_array; + LevelBuilder level_builder(pool_); RETURN_NOT_OK(level_builder.GenerateLevels(data, arrow_schema->field(0), &values_offset, &values_type, &num_values, &num_levels, &def_levels_buffer, &rep_levels_buffer, &_values_array)); http://git-wip-us.apache.org/repos/asf/parquet-cpp/blob/18b50448/src/parquet/file/writer.cc ---------------------------------------------------------------------- diff --git a/src/parquet/file/writer.cc b/src/parquet/file/writer.cc index cafa6b8..d52c25c 100644 --- a/src/parquet/file/writer.cc +++ b/src/parquet/file/writer.cc @@ -53,7 +53,6 @@ int64_t RowGroupWriter::num_rows() const { return contents_->num_rows(); } - // ---------------------------------------------------------------------- // ParquetFileWriter public API