Repository: arrow Updated Branches: refs/heads/master e18abac19 -> cdee23c27
ARROW-600: ZSTD compression lib support Author: Max Risukhin <risuhin....@gmail.com> Closes #807 from MaxRis/ARROW-600 and squashes the following commits: 2fc4578 [Max Risukhin] ARROW-600: ZSTD compression lib support Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/cdee23c2 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/cdee23c2 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/cdee23c2 Branch: refs/heads/master Commit: cdee23c27ac36f957512e33cc1ee49674c515dc8 Parents: e18abac Author: Max Risukhin <risuhin....@gmail.com> Authored: Mon Jul 3 22:39:09 2017 +0200 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Mon Jul 3 22:39:09 2017 +0200 ---------------------------------------------------------------------- cpp/CMakeLists.txt | 3 +- cpp/build-support/build-zstd-lib.sh | 16 ++++++ cpp/cmake_modules/FindZSTD.cmake | 70 ++++++++++++++++++++++++ cpp/cmake_modules/ThirdpartyToolchain.cmake | 49 ++++++++++++++++- cpp/src/arrow/util/compression-test.cc | 4 ++ cpp/src/arrow/util/compression.cc | 28 ++++++++++ cpp/src/arrow/util/compression.h | 16 +++++- python/manylinux1/Dockerfile-x86_64_base | 8 +++ python/manylinux1/scripts/build_lz4.sh | 24 ++++++++ python/manylinux1/scripts/build_zstd.sh | 25 +++++++++ 10 files changed, 239 insertions(+), 4 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5745338..28a3bb0 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -530,7 +530,8 @@ set(ARROW_STATIC_LINK_LIBS brotli_enc brotli_common snappy - zlib) + zlib + zstd_static) add_dependencies(arrow_dependencies ${ARROW_STATIC_LINK_LIBS}) http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/build-support/build-zstd-lib.sh ---------------------------------------------------------------------- diff --git a/cpp/build-support/build-zstd-lib.sh b/cpp/build-support/build-zstd-lib.sh new file mode 100755 index 0000000..62805ba --- /dev/null +++ b/cpp/build-support/build-zstd-lib.sh @@ -0,0 +1,16 @@ +#!/bin/sh +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +export CFLAGS="${CFLAGS} -O3 -fPIC" +make -j4 \ No newline at end of file http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/cmake_modules/FindZSTD.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/FindZSTD.cmake b/cpp/cmake_modules/FindZSTD.cmake new file mode 100644 index 0000000..1fda29e --- /dev/null +++ b/cpp/cmake_modules/FindZSTD.cmake @@ -0,0 +1,70 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# - Find ZSTD (zstd.h, libzstd.a, libzstd.so, and libzstd.so.0) +# This module defines +# ZSTD_INCLUDE_DIR, directory containing headers +# ZSTD_SHARED_LIB, path to libzstd shared library +# ZSTD_STATIC_LIB, path to libzstd static library +# ZSTD_FOUND, whether zstd has been found + +if( NOT "${ZSTD_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "${ZSTD_HOME}" _native_path ) + list( APPEND _zstd_roots ${_native_path} ) +elseif ( ZStd_HOME ) + list( APPEND _zstd_roots ${ZStd_HOME} ) +endif() + +if (MSVC AND NOT ZSTD_MSVC_STATIC_LIB_SUFFIX) + set(ZSTD_MSVC_STATIC_LIB_SUFFIX "_static") +endif() + +set(ZSTD_STATIC_LIB_SUFFIX + "${ZSTD_MSVC_STATIC_LIB_SUFFIX}${CMAKE_STATIC_LIBRARY_SUFFIX}") + +set(ZSTD_STATIC_LIB_NAME + ${CMAKE_STATIC_LIBRARY_PREFIX}zstd${ZSTD_STATIC_LIB_SUFFIX}) + +if ( _zstd_roots ) + find_path(ZSTD_INCLUDE_DIR NAMES zstd.h + PATHS ${_zstd_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library(ZSTD_SHARED_LIB NAMES zstd + PATHS ${_zstd_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) + find_library(ZSTD_STATIC_LIB NAMES ${ZSTD_STATIC_LIB_NAME} + PATHS ${_zstd_roots} + NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else() + find_path(ZSTD_INCLUDE_DIR zstd.h + # make sure we don't accidentally pick up a different version + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + find_library(ZSTD_SHARED_LIB zstd + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) + find_library(ZSTD_STATIC_LIB ${ZSTD_STATIC_LIB_NAME} + NO_CMAKE_SYSTEM_PATH + NO_SYSTEM_ENVIRONMENT_PATH) +endif() + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(ZSTD REQUIRED_VARS + ZSTD_SHARED_LIB ZSTD_STATIC_LIB ZSTD_INCLUDE_DIR) http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/cmake_modules/ThirdpartyToolchain.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/ThirdpartyToolchain.cmake b/cpp/cmake_modules/ThirdpartyToolchain.cmake index 8573345..33447ae 100644 --- a/cpp/cmake_modules/ThirdpartyToolchain.cmake +++ b/cpp/cmake_modules/ThirdpartyToolchain.cmake @@ -28,6 +28,7 @@ set(JEMALLOC_VERSION "4.4.0") set(SNAPPY_VERSION "1.1.3") set(BROTLI_VERSION "v0.6.0") set(LZ4_VERSION "1.7.5") +set(ZSTD_VERSION "1.2.0") string(TOUPPER ${CMAKE_BUILD_TYPE} UPPERCASE_BUILD_TYPE) @@ -49,6 +50,7 @@ if (NOT "$ENV{ARROW_BUILD_TOOLCHAIN}" STREQUAL "") set(ZLIB_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(BROTLI_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") set(LZ4_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") + set(ZSTD_HOME "$ENV{ARROW_BUILD_TOOLCHAIN}") if (NOT DEFINED ENV{BOOST_ROOT}) # Since we have to set this in the environment, we check whether @@ -89,6 +91,10 @@ if (DEFINED ENV{LZ4_HOME}) set(LZ4_HOME "$ENV{LZ4_HOME}") endif() +if (DEFINED ENV{ZSTD_HOME}) + set(ZSTD_HOME "$ENV{ZSTD_HOME}") +endif() + # ---------------------------------------------------------------------- # Find pthreads @@ -653,8 +659,8 @@ if (NOT LZ4_FOUND) set(LZ4_INCLUDE_DIR "${LZ4_BUILD_DIR}/lib") if (MSVC) - set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_Release/liblz4_static.lib") - set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=Release /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln) + set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/visual/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/liblz4_static.lib") + set(LZ4_BUILD_COMMAND BUILD_COMMAND msbuild.exe /m /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /t:Build ${LZ4_BUILD_DIR}/visual/VS2010/lz4.sln) else() set(LZ4_STATIC_LIB "${LZ4_BUILD_DIR}/lib/liblz4.a") set(LZ4_BUILD_COMMAND BUILD_COMMAND make -j4) @@ -682,3 +688,42 @@ ADD_THIRDPARTY_LIB(lz4_static if (LZ4_VENDORED) add_dependencies(lz4_static lz4_ep) endif() + +# ---------------------------------------------------------------------- +# ZSTD + +find_package(ZSTD) +if (NOT ZSTD_FOUND) + set(ZSTD_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/zstd_ep-prefix/src/zstd_ep") + set(ZSTD_INCLUDE_DIR "${ZSTD_BUILD_DIR}/lib") + + if (MSVC) + set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/libzstd_static.lib") + set(ZSTD_BUILD_COMMAND BUILD_COMMAND msbuild ${ZSTD_BUILD_DIR}/build/VS2010/zstd.sln /t:Build /v:minimal /p:Configuration=${CMAKE_BUILD_TYPE} /p:Platform=x64 /p:PlatformToolset=v140 /p:OutDir=${ZSTD_BUILD_DIR}/build/VS2010/bin/x64_${CMAKE_BUILD_TYPE}/ /p:SolutionDir=${ZSTD_BUILD_DIR}/build/VS2010/ ) + else() + set(ZSTD_STATIC_LIB "${ZSTD_BUILD_DIR}/lib/libzstd.a") + set(ZSTD_BUILD_COMMAND BUILD_COMMAND ${CMAKE_SOURCE_DIR}/build-support/build-zstd-lib.sh) + endif() + + ExternalProject_Add(zstd_ep + URL "https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz" + UPDATE_COMMAND "" + PATCH_COMMAND "" + CONFIGURE_COMMAND "" + INSTALL_COMMAND "" + BINARY_DIR ${ZSTD_BUILD_DIR} + ${ZSTD_BUILD_COMMAND} + ) + + set(ZSTD_VENDORED 1) +else() + set(ZSTD_VENDORED 0) +endif() + +include_directories(SYSTEM ${ZSTD_INCLUDE_DIR}) +ADD_THIRDPARTY_LIB(zstd_static + STATIC_LIB ${ZSTD_STATIC_LIB}) + +if (ZSTD_VENDORED) + add_dependencies(zstd_static zstd_ep) +endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/src/arrow/util/compression-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/compression-test.cc b/cpp/src/arrow/util/compression-test.cc index 1a0e5d7..3b19a6d 100644 --- a/cpp/src/arrow/util/compression-test.cc +++ b/cpp/src/arrow/util/compression-test.cc @@ -86,4 +86,8 @@ TEST(TestCompressors, GZip) { CheckCodec<GZipCodec>(); } +TEST(TestCompressors, ZSTD) { + CheckCodec<ZSTDCodec>(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/src/arrow/util/compression.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/compression.cc b/cpp/src/arrow/util/compression.cc index 070f857..df1afa3 100644 --- a/cpp/src/arrow/util/compression.cc +++ b/cpp/src/arrow/util/compression.cc @@ -31,6 +31,7 @@ #include <brotli/encode.h> #include <snappy.h> #include <zlib.h> +#include <zstd.h> #include "arrow/status.h" #include "arrow/util/logging.h" @@ -329,4 +330,31 @@ Status BrotliCodec::Compress(int64_t input_len, const uint8_t* input, return Status::OK(); } +// ---------------------------------------------------------------------- +// ZSTD implementation + +Status ZSTDCodec::Decompress( + int64_t input_len, const uint8_t* input, int64_t output_len, uint8_t* output_buffer) { + int64_t decompressed_size = ZSTD_decompress(output_buffer, + static_cast<size_t>(output_len), input, static_cast<size_t>(input_len)); + if (decompressed_size != output_len) { + return Status::IOError("Corrupt ZSTD compressed data."); + } + return Status::OK(); +} + +int64_t ZSTDCodec::MaxCompressedLen(int64_t input_len, const uint8_t* input) { + return ZSTD_compressBound(input_len); +} + +Status ZSTDCodec::Compress(int64_t input_len, const uint8_t* input, + int64_t output_buffer_len, uint8_t* output_buffer, int64_t* output_length) { + *output_length = ZSTD_compress(output_buffer, static_cast<size_t>(output_buffer_len), + input, static_cast<size_t>(input_len), 1); + if (ZSTD_isError(*output_length)) { + return Status::IOError("ZSTD compression failure."); + } + return Status::OK(); +} + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/cpp/src/arrow/util/compression.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/compression.h b/cpp/src/arrow/util/compression.h index 6886d04..9e581d8 100644 --- a/cpp/src/arrow/util/compression.h +++ b/cpp/src/arrow/util/compression.h @@ -27,7 +27,7 @@ namespace arrow { struct Compression { - enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI }; + enum type { UNCOMPRESSED, SNAPPY, GZIP, LZO, BROTLI, ZSTD }; }; class ARROW_EXPORT Codec { @@ -104,6 +104,20 @@ class ARROW_EXPORT GZipCodec : public Codec { std::unique_ptr<GZipCodecImpl> impl_; }; +// ZSTD codec. +class ARROW_EXPORT ZSTDCodec : public Codec { + public: + Status Decompress(int64_t input_len, const uint8_t* input, int64_t output_len, + uint8_t* output_buffer) override; + + Status Compress(int64_t input_len, const uint8_t* input, int64_t output_buffer_len, + uint8_t* output_buffer, int64_t* output_length) override; + + int64_t MaxCompressedLen(int64_t input_len, const uint8_t* input) override; + + const char* name() const override { return "zstd"; } +}; + } // namespace arrow #endif http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/python/manylinux1/Dockerfile-x86_64_base ---------------------------------------------------------------------- diff --git a/python/manylinux1/Dockerfile-x86_64_base b/python/manylinux1/Dockerfile-x86_64_base index 44a9888..cdd13e2 100644 --- a/python/manylinux1/Dockerfile-x86_64_base +++ b/python/manylinux1/Dockerfile-x86_64_base @@ -49,6 +49,14 @@ ADD scripts/build_snappy.sh / RUN /build_snappy.sh ENV SNAPPY_HOME /usr +ADD scripts/build_lz4.sh / +RUN /build_lz4.sh +ENV LZ4_HOME /usr + +ADD scripts/build_zstd.sh / +RUN /build_zstd.sh +ENV ZSTD_HOME /usr + ADD scripts/build_ccache.sh / RUN /build_ccache.sh http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/python/manylinux1/scripts/build_lz4.sh ---------------------------------------------------------------------- diff --git a/python/manylinux1/scripts/build_lz4.sh b/python/manylinux1/scripts/build_lz4.sh new file mode 100755 index 0000000..5a25d3d --- /dev/null +++ b/python/manylinux1/scripts/build_lz4.sh @@ -0,0 +1,24 @@ +#!/bin/bash -ex +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + +export LZ4_VERSION="1.7.5" +export PREFIX="/usr" +export LDFLAGS="${LDFLAGS} -Wl,-rpath,${PREFIX}/lib -L${PREFIX}/lib" +wget "https://github.com/lz4/lz4/archive/v${LZ4_VERSION}.tar.gz" -O lz4-${LZ4_VERSION}.tar.gz +tar xf lz4-${LZ4_VERSION}.tar.gz +pushd lz4-${LZ4_VERSION} + +make -j5 PREFIX=${PREFIX} +make install PREFIX=$PREFIX +popd +rm -rf lz4-${LZ4_VERSION}.tar.gz lz4-${LZ4_VERSION} http://git-wip-us.apache.org/repos/asf/arrow/blob/cdee23c2/python/manylinux1/scripts/build_zstd.sh ---------------------------------------------------------------------- diff --git a/python/manylinux1/scripts/build_zstd.sh b/python/manylinux1/scripts/build_zstd.sh new file mode 100755 index 0000000..268e5c8 --- /dev/null +++ b/python/manylinux1/scripts/build_zstd.sh @@ -0,0 +1,25 @@ +#!/bin/bash -ex +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. See accompanying LICENSE file. + +export ZSTD_VERSION="1.2.0" +export CFLAGS="${CFLAGS} -O3 -fPIC" +export PREFIX="/usr" +export LDFLAGS="${LDFLAGS} -Wl,-rpath,${PREFIX}/lib" +wget "https://github.com/facebook/zstd/archive/v${ZSTD_VERSION}.tar.gz" -O zstd-${ZSTD_VERSION}.tar.gz +tar xf zstd-${ZSTD_VERSION}.tar.gz +pushd zstd-${ZSTD_VERSION} + +make -j5 +make install PREFIX=$PREFIX +popd +rm -rf zstd-${ZSTD_VERSION}.tar.gz zstd-${ZSTD_VERSION}