Repository: arrow Updated Branches: refs/heads/master 2d8ec7893 -> 637584bec
ARROW-284: Disable arrow_parquet module in Travis CI to triage builds Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #132 from wesm/ARROW-284 and squashes the following commits: e3410cf [Wes McKinney] Install miniconda in $HOME to avoid long prefix issues in conda-build 2.0 9fd94f5 [Wes McKinney] Do not run death test when valgrind is enabled. Gracefully skip pyarrow.parquet when ARROW_PARQUET=off ccf56f8 [Wes McKinney] Disable arrow_parquet module in Travis CI Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/637584be Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/637584be Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/637584be Branch: refs/heads/master Commit: 637584becb2db88fc510824c22b87e6effb2232f Parents: 2d8ec78 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Tue Sep 6 23:59:30 2016 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Tue Sep 6 23:59:30 2016 -0400 ---------------------------------------------------------------------- ci/travis_before_script_cpp.sh | 4 +-- ci/travis_install_conda.sh | 4 ++- ci/travis_script_python.sh | 6 +++-- cpp/cmake_modules/FindParquet.cmake | 1 + cpp/src/arrow/util/memory-pool-test.cc | 6 +++++ python/CMakeLists.txt | 41 ++++++++++++++++++----------- python/cmake_modules/FindArrow.cmake | 26 +++++++++++------- python/pyarrow/tests/test_io.py | 1 + python/pyarrow/tests/test_parquet.py | 38 +++++++++++++++++--------- python/pyarrow/tests/test_table.py | 7 +---- python/setup.py | 27 +++++++++++-------- 11 files changed, 101 insertions(+), 60 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/ci/travis_before_script_cpp.sh ---------------------------------------------------------------------- diff --git a/ci/travis_before_script_cpp.sh b/ci/travis_before_script_cpp.sh index 08551f3..2f02ef2 100755 --- a/ci/travis_before_script_cpp.sh +++ b/ci/travis_before_script_cpp.sh @@ -25,8 +25,8 @@ echo $GTEST_HOME CMAKE_COMMON_FLAGS="\ -DARROW_BUILD_BENCHMARKS=ON \ --DARROW_PARQUET=ON \ --DARROW_HDFS=on \ +-DARROW_PARQUET=OFF \ +-DARROW_HDFS=ON \ -DCMAKE_INSTALL_PREFIX=$ARROW_CPP_INSTALL" if [ $TRAVIS_OS_NAME == "linux" ]; then http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/ci/travis_install_conda.sh ---------------------------------------------------------------------- diff --git a/ci/travis_install_conda.sh b/ci/travis_install_conda.sh index 3a8f57b..e922525 100644 --- a/ci/travis_install_conda.sh +++ b/ci/travis_install_conda.sh @@ -9,7 +9,9 @@ else fi wget -O miniconda.sh $MINICONDA_URL -export MINICONDA=$TRAVIS_BUILD_DIR/miniconda + +export MINICONDA=$HOME/miniconda + bash miniconda.sh -b -p $MINICONDA export PATH="$MINICONDA/bin:$PATH" conda update -y -q conda http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/ci/travis_script_python.sh ---------------------------------------------------------------------- diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 4a37742..61c8e44 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -5,7 +5,7 @@ set -e PYTHON_DIR=$TRAVIS_BUILD_DIR/python # Re-use conda installation from C++ -export MINICONDA=$TRAVIS_BUILD_DIR/miniconda +export MINICONDA=$HOME/miniconda export PATH="$MINICONDA/bin:$PATH" export PARQUET_HOME=$MINICONDA @@ -31,7 +31,9 @@ python_version_tests() { # Expensive dependencies install from Continuum package repo conda install -y pip numpy pandas cython - conda install -y parquet-cpp arrow-cpp -c apache/channel/dev + # conda install -y parquet-cpp + + conda install -y arrow-cpp -c apache/channel/dev # Other stuff pip install pip install -r requirements.txt http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/cpp/cmake_modules/FindParquet.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/FindParquet.cmake b/cpp/cmake_modules/FindParquet.cmake index e3350d6..36f4828 100644 --- a/cpp/cmake_modules/FindParquet.cmake +++ b/cpp/cmake_modules/FindParquet.cmake @@ -72,6 +72,7 @@ else () endif () mark_as_advanced( + PARQUET_FOUND PARQUET_INCLUDE_DIR PARQUET_LIBS PARQUET_LIBRARIES http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/cpp/src/arrow/util/memory-pool-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/util/memory-pool-test.cc b/cpp/src/arrow/util/memory-pool-test.cc index deb7ffd..e767e95 100644 --- a/cpp/src/arrow/util/memory-pool-test.cc +++ b/cpp/src/arrow/util/memory-pool-test.cc @@ -46,6 +46,10 @@ TEST(DefaultMemoryPool, OOM) { ASSERT_RAISES(OutOfMemory, pool->Allocate(to_alloc, &data)); } +// Death tests and valgrind are known to not play well 100% of the time. See +// googletest documentation +#ifndef ARROW_VALGRIND + TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { MemoryPool* pool = default_memory_pool(); @@ -60,4 +64,6 @@ TEST(DefaultMemoryPoolDeathTest, FreeLargeMemory) { pool->Free(data, 100); } +#endif // ARROW_VALGRIND + } // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index fdbfce9..5228958 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -340,8 +340,10 @@ if (PYARROW_BUILD_TESTS) endif() ## Parquet -find_package(Parquet REQUIRED) -include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) +find_package(Parquet) +if(PARQUET_FOUND) + include_directories(SYSTEM ${PARQUET_INCLUDE_DIR}) +endif() ## Arrow find_package(Arrow REQUIRED) @@ -350,8 +352,6 @@ ADD_THIRDPARTY_LIB(arrow SHARED_LIB ${ARROW_SHARED_LIB}) ADD_THIRDPARTY_LIB(arrow_io SHARED_LIB ${ARROW_IO_SHARED_LIB}) -ADD_THIRDPARTY_LIB(arrow_parquet - SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) ############################################################ # Linker setup @@ -418,6 +418,16 @@ endif() add_subdirectory(src/pyarrow) add_subdirectory(src/pyarrow/util) +set(CYTHON_EXTENSIONS + array + config + error + io + scalar + schema + table +) + set(PYARROW_SRCS src/pyarrow/common.cc src/pyarrow/config.cc @@ -431,9 +441,19 @@ set(PYARROW_SRCS set(LINK_LIBS arrow arrow_io - arrow_parquet ) +if(PARQUET_FOUND AND ARROW_PARQUET_FOUND) + ADD_THIRDPARTY_LIB(arrow_parquet + SHARED_LIB ${ARROW_PARQUET_SHARED_LIB}) + set(LINK_LIBS + ${LINK_LIBS} + arrow_parquet) + set(CYTHON_EXTENSIONS + ${CYTHON_EXTENSIONS} + parquet) +endif() + SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE) add_library(pyarrow SHARED @@ -448,17 +468,6 @@ endif() # Setup and build Cython modules ############################################################ -set(CYTHON_EXTENSIONS - array - config - error - io - parquet - scalar - schema - table -) - foreach(module ${CYTHON_EXTENSIONS}) string(REPLACE "." ";" directories ${module}) list(GET directories -1 module_name) http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/python/cmake_modules/FindArrow.cmake ---------------------------------------------------------------------- diff --git a/python/cmake_modules/FindArrow.cmake b/python/cmake_modules/FindArrow.cmake index 6bd3056..5d5efc4 100644 --- a/python/cmake_modules/FindArrow.cmake +++ b/python/cmake_modules/FindArrow.cmake @@ -52,7 +52,7 @@ find_library(ARROW_IO_LIB_PATH NAMES arrow_io ${ARROW_SEARCH_LIB_PATH} NO_DEFAULT_PATH) -if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) +if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH) set(ARROW_FOUND TRUE) set(ARROW_LIB_NAME libarrow) set(ARROW_IO_LIB_NAME libarrow_io) @@ -64,18 +64,9 @@ if (ARROW_INCLUDE_DIR AND ARROW_LIB_PATH AND ARROW_PARQUET_LIB_PATH) set(ARROW_IO_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_IO_LIB_NAME}.a) set(ARROW_IO_SHARED_LIB ${ARROW_LIBS}/${ARROW_IO_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) - - set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) - set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) -else () - set(ARROW_FOUND FALSE) -endif () - -if (ARROW_FOUND) if (NOT Arrow_FIND_QUIETLY) message(STATUS "Found the Arrow core library: ${ARROW_LIB_PATH}") message(STATUS "Found the Arrow IO library: ${ARROW_IO_LIB_PATH}") - message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") endif () else () if (NOT Arrow_FIND_QUIETLY) @@ -88,8 +79,23 @@ else () message(STATUS "${ARROW_ERR_MSG}") endif (Arrow_FIND_REQUIRED) endif () + set(ARROW_FOUND FALSE) endif () +if(ARROW_PARQUET_LIB_PATH) + set(ARROW_PARQUET_FOUND TRUE) + set(ARROW_PARQUET_STATIC_LIB ${ARROW_SEARCH_LIB_PATH}/${ARROW_PARQUET_LIB_NAME}.a) + set(ARROW_PARQUET_SHARED_LIB ${ARROW_LIBS}/${ARROW_PARQUET_LIB_NAME}${CMAKE_SHARED_LIBRARY_SUFFIX}) + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Found the Arrow Parquet library: ${ARROW_PARQUET_LIB_PATH}") + endif () +else() + if (NOT Arrow_FIND_QUIETLY) + message(STATUS "Could not find Arrow Parquet library") + endif() + set(ARROW_PARQUET_FOUND FALSE) +endif() + mark_as_advanced( ARROW_INCLUDE_DIR ARROW_LIBS http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/python/pyarrow/tests/test_io.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index 328e923..eb92e8e 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -46,6 +46,7 @@ libhdfs = pytest.mark.skipif(not io.have_libhdfs(), HDFS_TMP_PATH = '/tmp/pyarrow-test-{0}'.format(random.randint(0, 1000)) + @pytest.fixture(scope='session') def hdfs(request): fixture = hdfs_test_client() http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/python/pyarrow/tests/test_parquet.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index d89d947..8a2d8ca 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -15,33 +15,45 @@ # specific language governing permissions and limitations # under the License. -from pyarrow.compat import unittest -import pyarrow as arrow -import pyarrow.parquet +import pytest -A = arrow +import pyarrow as A import numpy as np -import os.path import pandas as pd import pandas.util.testing as pdt +try: + import pyarrow.parquet as pq + HAVE_PARQUET = True +except ImportError: + HAVE_PARQUET = False +# XXX: Make Parquet tests opt-in rather than skip-if-not-build +parquet = pytest.mark.skipif(not HAVE_PARQUET, + reason='Parquet support not built') + + +@parquet def test_single_pylist_column_roundtrip(tmpdir): for dtype in [int, float]: - filename = tmpdir.join('single_{}_column.parquet'.format(dtype.__name__)) + filename = tmpdir.join('single_{}_column.parquet' + .format(dtype.__name__)) data = [A.from_pylist(list(map(dtype, range(5))))] table = A.Table.from_arrays(('a', 'b'), data, 'table_name') A.parquet.write_table(table, filename.strpath) - table_read = pyarrow.parquet.read_table(filename.strpath) - for col_written, col_read in zip(table.itercolumns(), table_read.itercolumns()): + table_read = pq.read_table(filename.strpath) + for col_written, col_read in zip(table.itercolumns(), + table_read.itercolumns()): assert col_written.name == col_read.name assert col_read.data.num_chunks == 1 data_written = col_written.data.chunk(0) data_read = col_read.data.chunk(0) assert data_written.equals(data_read) + +@parquet def test_pandas_parquet_2_0_rountrip(tmpdir): size = 10000 np.random.seed(0) @@ -58,17 +70,20 @@ def test_pandas_parquet_2_0_rountrip(tmpdir): 'float64': np.arange(size, dtype=np.float64), 'bool': np.random.randn(size) > 0, # Pandas only support ns resolution, Arrow at the moment only ms - 'datetime': np.arange("2016-01-01T00:00:00.001", size, dtype='datetime64[ms]'), + 'datetime': np.arange("2016-01-01T00:00:00.001", size, + dtype='datetime64[ms]'), 'str': [str(x) for x in range(size)], 'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None] }) filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df, timestamps_to_ms=True) A.parquet.write_table(arrow_table, filename.strpath, version="2.0") - table_read = pyarrow.parquet.read_table(filename.strpath) + table_read = pq.read_table(filename.strpath) df_read = table_read.to_pandas() pdt.assert_frame_equal(df, df_read) + +@parquet def test_pandas_parquet_1_0_rountrip(tmpdir): size = 10000 np.random.seed(0) @@ -88,11 +103,10 @@ def test_pandas_parquet_1_0_rountrip(tmpdir): filename = tmpdir.join('pandas_rountrip.parquet') arrow_table = A.from_pandas_dataframe(df) A.parquet.write_table(arrow_table, filename.strpath, version="1.0") - table_read = pyarrow.parquet.read_table(filename.strpath) + table_read = pq.read_table(filename.strpath) df_read = table_read.to_pandas() # We pass uint32_t as int64_t if we write Parquet version 1.0 df['uint32'] = df['uint32'].values.astype(np.int64) pdt.assert_frame_equal(df, df_read) - http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/python/pyarrow/tests/test_table.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_table.py b/python/pyarrow/tests/test_table.py index 83fcbb8..abf1431 100644 --- a/python/pyarrow/tests/test_table.py +++ b/python/pyarrow/tests/test_table.py @@ -16,11 +16,7 @@ # under the License. from pyarrow.compat import unittest -import pyarrow as arrow - -A = arrow - -import pandas as pd +import pyarrow as A class TestRowBatch(unittest.TestCase): @@ -76,4 +72,3 @@ class TestTable(unittest.TestCase): assert set(df.columns) == set(('a', 'b')) assert df.shape == (5, 2) assert df.ix[0, 'b'] == -10 - http://git-wip-us.apache.org/repos/asf/arrow/blob/637584be/python/setup.py ---------------------------------------------------------------------- diff --git a/python/setup.py b/python/setup.py index 59410d7..a5db2b0 100644 --- a/python/setup.py +++ b/python/setup.py @@ -97,6 +97,18 @@ class build_ext(_build_ext): _build_ext.initialize_options(self) self.extra_cmake_args = '' + CYTHON_MODULE_NAMES = [ + 'array', + 'config', + 'error', + 'io', + 'parquet', + 'scalar', + 'schema', + 'table'] + + CYTHON_ALLOWED_FAILURES = ['parquet'] + def _run_cmake(self): # The directory containing this setup.py source = osp.dirname(osp.abspath(__file__)) @@ -172,10 +184,13 @@ class build_ext(_build_ext): # Move the built C-extension to the place expected by the Python build self._found_names = [] - for name in self.get_cmake_cython_names(): + for name in self.CYTHON_MODULE_NAMES: built_path = self.get_ext_built(name) if not os.path.exists(built_path): print(built_path) + if name in self.CYTHON_ALLOWED_FAILURES: + print('Cython module {0} failure permitted'.format(name)) + continue raise RuntimeError('libpyarrow C-extension failed to build:', os.path.abspath(built_path)) @@ -213,16 +228,6 @@ class build_ext(_build_ext): suffix = sysconfig.get_config_var('SO') return name + suffix - def get_cmake_cython_names(self): - return ['array', - 'config', - 'error', - 'io', - 'parquet', - 'scalar', - 'schema', - 'table'] - def get_names(self): return self._found_names