ARROW-341: [Python] Move pyarrow's C++ code to the main C++ source tree, install libarrow_python and headers
This will enable third parties to link to `libarrow_python`. For now, the pyarrow build system continues to use CMake -- for the purpose of resolving the thirdparty toolchain we may or may not want to go completely to distutils, but we can sort that out later. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #440 from wesm/ARROW-341 and squashes the following commits: 193bc51 [Wes McKinney] Ensure that '-undefined dynamic_lookup' is passed when linking shared library on OS X a93496b [Wes McKinney] Add missing backslash 7620f50 [Wes McKinney] Fix cpplint issues 0617c69 [Wes McKinney] Fix LD_LIBRARY_PATH, ARROW_HOME 090c78c [Wes McKinney] Build Arrow library stack specific to active Python version 10e4626 [Wes McKinney] Get Python test suite passing again cfb7f44 [Wes McKinney] Remove print statement c1e63dc [Wes McKinney] Scrubbing python/CMakeLists.txt b80b153 [Wes McKinney] Cleanup, build pandas-test within main test suite 7ef1f81 [Wes McKinney] Start moving python/src/pyarrow tp cpp/src/arrow/python Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/3aac4ade Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/3aac4ade Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/3aac4ade Branch: refs/heads/master Commit: 3aac4adef11345f211e4c66467ff758cbc397e43 Parents: 6d4e862 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Sun Mar 26 11:45:38 2017 -0400 Committer: Wes McKinney <wes.mckin...@twosigma.com> Committed: Sun Mar 26 11:45:38 2017 -0400 ---------------------------------------------------------------------- ci/travis_script_python.sh | 26 +- cpp/CMakeLists.txt | 115 +- cpp/cmake_modules/BuildUtils.cmake | 88 +- cpp/cmake_modules/FindNumPy.cmake | 100 ++ cpp/cmake_modules/FindPythonLibsNew.cmake | 241 +++ cpp/src/arrow/python/CMakeLists.txt | 93 + cpp/src/arrow/python/api.h | 27 + cpp/src/arrow/python/builtin_convert.cc | 527 ++++++ cpp/src/arrow/python/builtin_convert.h | 54 + cpp/src/arrow/python/common.cc | 68 + cpp/src/arrow/python/common.h | 139 ++ cpp/src/arrow/python/config.cc | 35 + cpp/src/arrow/python/config.h | 45 + cpp/src/arrow/python/do_import_numpy.h | 21 + cpp/src/arrow/python/helpers.cc | 55 + cpp/src/arrow/python/helpers.h | 35 + cpp/src/arrow/python/io.cc | 222 +++ cpp/src/arrow/python/io.h | 99 ++ cpp/src/arrow/python/numpy_interop.h | 60 + cpp/src/arrow/python/pandas-test.cc | 64 + cpp/src/arrow/python/pandas_convert.cc | 1936 +++++++++++++++++++++ cpp/src/arrow/python/pandas_convert.h | 79 + cpp/src/arrow/python/type_traits.h | 213 +++ cpp/src/arrow/python/util/CMakeLists.txt | 39 + cpp/src/arrow/python/util/datetime.h | 42 + cpp/src/arrow/python/util/test_main.cc | 36 + python/CMakeLists.txt | 215 +-- python/cmake_modules/FindArrow.cmake | 9 + python/cmake_modules/FindNumPy.cmake | 100 -- python/cmake_modules/FindPythonLibsNew.cmake | 241 --- python/pyarrow/config.pyx | 14 +- python/pyarrow/includes/pyarrow.pxd | 6 +- python/setup.py | 11 +- python/src/pyarrow/CMakeLists.txt | 22 - python/src/pyarrow/adapters/builtin.cc | 527 ------ python/src/pyarrow/adapters/builtin.h | 54 - python/src/pyarrow/adapters/pandas-test.cc | 64 - python/src/pyarrow/adapters/pandas.cc | 1936 --------------------- python/src/pyarrow/adapters/pandas.h | 79 - python/src/pyarrow/api.h | 26 - python/src/pyarrow/common.cc | 69 - python/src/pyarrow/common.h | 137 -- python/src/pyarrow/config.cc | 35 - python/src/pyarrow/config.h | 46 - python/src/pyarrow/do_import_numpy.h | 21 - python/src/pyarrow/helpers.cc | 55 - python/src/pyarrow/helpers.h | 35 - python/src/pyarrow/io.cc | 221 --- python/src/pyarrow/io.h | 99 -- python/src/pyarrow/numpy_interop.h | 60 - python/src/pyarrow/type_traits.h | 212 --- python/src/pyarrow/util/CMakeLists.txt | 39 - python/src/pyarrow/util/datetime.h | 42 - python/src/pyarrow/util/test_main.cc | 36 - 54 files changed, 4409 insertions(+), 4461 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/ci/travis_script_python.sh ---------------------------------------------------------------------- diff --git a/ci/travis_script_python.sh b/ci/travis_script_python.sh index 6f4b8e9..df11209 100755 --- a/ci/travis_script_python.sh +++ b/ci/travis_script_python.sh @@ -23,7 +23,6 @@ export MINICONDA=$HOME/miniconda export PATH="$MINICONDA/bin:$PATH" export ARROW_HOME=$ARROW_CPP_INSTALL -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$ARROW_CPP_INSTALL/lib pushd $PYTHON_DIR export PARQUET_HOME=$TRAVIS_BUILD_DIR/parquet-env @@ -70,11 +69,31 @@ build_parquet_cpp() { build_parquet_cpp -export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:$PARQUET_HOME/lib +function build_arrow_libraries() { + CPP_BUILD_DIR=$1 + CPP_DIR=$TRAVIS_BUILD_DIR/cpp + + mkdir $CPP_BUILD_DIR + pushd $CPP_BUILD_DIR + + cmake -DARROW_BUILD_TESTS=off \ + -DARROW_PYTHON=on \ + -DCMAKE_INSTALL_PREFIX=$2 \ + $CPP_DIR + + make -j4 + make install + + popd +} python_version_tests() { PYTHON_VERSION=$1 CONDA_ENV_DIR=$TRAVIS_BUILD_DIR/pyarrow-test-$PYTHON_VERSION + + export ARROW_HOME=$TRAVIS_BUILD_DIR/arrow-install-$PYTHON_VERSION + export LD_LIBRARY_PATH=$ARROW_HOME/lib:$PARQUET_HOME/lib + conda create -y -q -p $CONDA_ENV_DIR python=$PYTHON_VERSION source activate $CONDA_ENV_DIR @@ -87,6 +106,9 @@ python_version_tests() { # Expensive dependencies install from Continuum package repo conda install -y pip numpy pandas cython + # Build C++ libraries + build_arrow_libraries arrow-build-$PYTHON_VERSION $ARROW_HOME + # Other stuff pip install pip install -r requirements.txt http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index c04afe4..c77cf60 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -106,6 +106,10 @@ if("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") "Rely on boost shared libraries where relevant" ON) + option(ARROW_PYTHON + "Build the Arrow CPython extensions" + OFF) + option(ARROW_SSE3 "Build Arrow with SSE3" ON) @@ -133,6 +137,7 @@ if(NOT ARROW_BUILD_BENCHMARKS) set(NO_BENCHMARKS 1) endif() +include(BuildUtils) ############################################################ # Compiler flags @@ -303,6 +308,14 @@ endfunction() # # Arguments after the test name will be passed to set_tests_properties(). function(ADD_ARROW_TEST REL_TEST_NAME) + set(options) + set(single_value_args) + set(multi_value_args STATIC_LINK_LIBS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + if(NO_TESTS OR NOT ARROW_BUILD_STATIC) return() endif() @@ -312,7 +325,13 @@ function(ADD_ARROW_TEST REL_TEST_NAME) # This test has a corresponding .cc file, set it up as an executable. set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc") - target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS}) + + if (ARG_STATIC_LINK_LIBS) + # Customize link libraries + target_link_libraries(${TEST_NAME} ${ARG_STATIC_LINK_LIBS}) + else() + target_link_libraries(${TEST_NAME} ${ARROW_TEST_LINK_LIBS}) + endif() add_dependencies(unittest ${TEST_NAME}) else() # No executable, just invoke the test (probably a script) directly. @@ -332,10 +351,6 @@ function(ADD_ARROW_TEST REL_TEST_NAME) ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) endif() set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") - - if(ARGN) - set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) - endif() endfunction() # A wrapper for add_dependencies() that is compatible with NO_TESTS. @@ -363,72 +378,6 @@ enable_testing() ############################################################ # Dependencies ############################################################ -function(ADD_THIRDPARTY_LIB LIB_NAME) - set(options) - set(one_value_args SHARED_LIB STATIC_LIB) - set(multi_value_args DEPS) - cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) - if(ARG_UNPARSED_ARGUMENTS) - message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") - endif() - - if(ARG_STATIC_LIB AND ARG_SHARED_LIB) - if(NOT ARG_STATIC_LIB) - message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") - endif() - - SET(AUG_LIB_NAME "${LIB_NAME}_static") - add_library(${AUG_LIB_NAME} STATIC IMPORTED) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") - message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") - - SET(AUG_LIB_NAME "${LIB_NAME}_shared") - add_library(${AUG_LIB_NAME} SHARED IMPORTED) - - if(MSVC) - # Mark the â.libâ location as part of a Windows DLL - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_IMPLIB "${ARG_SHARED_LIB}") - else() - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") - endif() - message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") - elseif(ARG_STATIC_LIB) - add_library(${LIB_NAME} STATIC IMPORTED) - set_target_properties(${LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") - SET(AUG_LIB_NAME "${LIB_NAME}_static") - add_library(${AUG_LIB_NAME} STATIC IMPORTED) - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") - message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") - elseif(ARG_SHARED_LIB) - add_library(${LIB_NAME} SHARED IMPORTED) - set_target_properties(${LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") - SET(AUG_LIB_NAME "${LIB_NAME}_shared") - add_library(${AUG_LIB_NAME} SHARED IMPORTED) - - if(MSVC) - # Mark the â.libâ location as part of a Windows DLL - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_IMPLIB "${ARG_SHARED_LIB}") - else() - set_target_properties(${AUG_LIB_NAME} - PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") - endif() - message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") - else() - message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") - endif() - - if(ARG_DEPS) - set_target_properties(${LIB_NAME} - PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") - endif() -endfunction() # ---------------------------------------------------------------------- # Add Boost dependencies (code adapted from Apache Kudu (incubating)) @@ -798,8 +747,7 @@ if (${CLANG_FORMAT_FOUND}) add_custom_target(format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 1 `find ${CMAKE_CURRENT_SOURCE_DIR}/src -name \\*.cc -or -name \\*.h | sed -e '/_generated/g' | - sed -e '/windows_compatibility.h/g'` - `find ${CMAKE_CURRENT_SOURCE_DIR}/../python -name \\*.cc -or -name \\*.h`) + sed -e '/windows_compatibility.h/g'`) # runs clang format and exits with a non-zero exit code if any files need to be reformatted add_custom_target(check-format ${BUILD_SUPPORT_DIR}/run-clang-format.sh ${CMAKE_CURRENT_SOURCE_DIR} ${CLANG_FORMAT_BIN} 0 @@ -857,11 +805,9 @@ if(NOT APPLE) set(ARROW_SHARED_LINK_FLAGS "-Wl,--version-script=${CMAKE_CURRENT_SOURCE_DIR}/src/arrow/symbols.map") endif() -include(BuildUtils) - ADD_ARROW_LIB(arrow - SOURCES ${ARROW_SRCS} - SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} + SOURCES ${ARROW_SRCS} + SHARED_LINK_FLAGS ${ARROW_SHARED_LINK_FLAGS} ) add_subdirectory(src/arrow) @@ -875,6 +821,10 @@ endif() #---------------------------------------------------------------------- # IPC library +if(ARROW_PYTHON) + set(ARROW_IPC on) +endif() + ## Flatbuffers if(ARROW_IPC) if("$ENV{FLATBUFFERS_HOME}" STREQUAL "") @@ -908,3 +858,14 @@ if(ARROW_IPC) add_subdirectory(src/arrow/ipc) endif() + +if(ARROW_PYTHON) + find_package(PythonLibsNew REQUIRED) + find_package(NumPy REQUIRED) + + include_directories(SYSTEM + ${NUMPY_INCLUDE_DIRS} + ${PYTHON_INCLUDE_DIRS}) + + add_subdirectory(src/arrow/python) +endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/cmake_modules/BuildUtils.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/BuildUtils.cmake b/cpp/cmake_modules/BuildUtils.cmake index 78b514c..c993041 100644 --- a/cpp/cmake_modules/BuildUtils.cmake +++ b/cpp/cmake_modules/BuildUtils.cmake @@ -15,6 +15,73 @@ # specific language governing permissions and limitations # under the License. +function(ADD_THIRDPARTY_LIB LIB_NAME) + set(options) + set(one_value_args SHARED_LIB STATIC_LIB) + set(multi_value_args DEPS) + cmake_parse_arguments(ARG "${options}" "${one_value_args}" "${multi_value_args}" ${ARGN}) + if(ARG_UNPARSED_ARGUMENTS) + message(SEND_ERROR "Error: unrecognized arguments: ${ARG_UNPARSED_ARGUMENTS}") + endif() + + if(ARG_STATIC_LIB AND ARG_SHARED_LIB) + if(NOT ARG_STATIC_LIB) + message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") + endif() + + SET(AUG_LIB_NAME "${LIB_NAME}_static") + add_library(${AUG_LIB_NAME} STATIC IMPORTED) + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + + SET(AUG_LIB_NAME "${LIB_NAME}_shared") + add_library(${AUG_LIB_NAME} SHARED IMPORTED) + + if(MSVC) + # Mark the â.libâ location as part of a Windows DLL + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_IMPLIB "${ARG_SHARED_LIB}") + else() + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + endif() + message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + elseif(ARG_STATIC_LIB) + add_library(${LIB_NAME} STATIC IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + SET(AUG_LIB_NAME "${LIB_NAME}_static") + add_library(${AUG_LIB_NAME} STATIC IMPORTED) + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_STATIC_LIB}") + message("Added static library dependency ${LIB_NAME}: ${ARG_STATIC_LIB}") + elseif(ARG_SHARED_LIB) + add_library(${LIB_NAME} SHARED IMPORTED) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + SET(AUG_LIB_NAME "${LIB_NAME}_shared") + add_library(${AUG_LIB_NAME} SHARED IMPORTED) + + if(MSVC) + # Mark the â.libâ location as part of a Windows DLL + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_IMPLIB "${ARG_SHARED_LIB}") + else() + set_target_properties(${AUG_LIB_NAME} + PROPERTIES IMPORTED_LOCATION "${ARG_SHARED_LIB}") + endif() + message("Added shared library dependency ${LIB_NAME}: ${ARG_SHARED_LIB}") + else() + message(FATAL_ERROR "No static or shared library provided for ${LIB_NAME}") + endif() + + if(ARG_DEPS) + set_target_properties(${LIB_NAME} + PROPERTIES IMPORTED_LINK_INTERFACE_LIBRARIES "${ARG_DEPS}") + endif() +endfunction() + function(ADD_ARROW_LIB LIB_NAME) set(options) set(one_value_args SHARED_LINK_FLAGS) @@ -45,9 +112,16 @@ function(ADD_ARROW_LIB LIB_NAME) if (ARROW_BUILD_SHARED) add_library(${LIB_NAME}_shared SHARED $<TARGET_OBJECTS:${LIB_NAME}_objlib>) + if(APPLE) - set_target_properties(${LIB_NAME}_shared PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + # On OS X, you can avoid linking at library load time and instead + # expecting that the symbols have been loaded separately. This happens + # with libpython* where there can be conflicts between system Python and + # the Python from a thirdparty distribution + set(ARG_SHARED_LINK_FLAGS + "-undefined dynamic_lookup ${ARG_SHARED_LINK_FLAGS}") endif() + set_target_properties(${LIB_NAME}_shared PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" @@ -55,6 +129,7 @@ function(ADD_ARROW_LIB LIB_NAME) OUTPUT_NAME ${LIB_NAME} VERSION "${ARROW_ABI_VERSION}" SOVERSION "${ARROW_SO_VERSION}") + target_link_libraries(${LIB_NAME}_shared LINK_PUBLIC ${ARG_SHARED_LINK_LIBS} LINK_PRIVATE ${ARG_SHARED_PRIVATE_LINK_LIBS}) @@ -68,28 +143,28 @@ function(ADD_ARROW_LIB LIB_NAME) set_target_properties(${LIB_NAME}_shared PROPERTIES INSTALL_RPATH ${_lib_install_rpath}) endif() - + install(TARGETS ${LIB_NAME}_shared LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() - + if (ARROW_BUILD_STATIC) add_library(${LIB_NAME}_static STATIC $<TARGET_OBJECTS:${LIB_NAME}_objlib>) set_target_properties(${LIB_NAME}_static PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${BUILD_OUTPUT_ROOT_DIRECTORY}" OUTPUT_NAME ${LIB_NAME}) - + target_link_libraries(${LIB_NAME}_static LINK_PUBLIC ${ARG_STATIC_LINK_LIBS} LINK_PRIVATE ${ARG_STATIC_PRIVATE_LINK_LIBS}) - + install(TARGETS ${LIB_NAME}_static LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR}) endif() - + if (APPLE) set_target_properties(${LIB_NAME}_shared PROPERTIES @@ -98,4 +173,3 @@ function(ADD_ARROW_LIB LIB_NAME) endif() endfunction() - http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/cmake_modules/FindNumPy.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/FindNumPy.cmake b/cpp/cmake_modules/FindNumPy.cmake new file mode 100644 index 0000000..58bb531 --- /dev/null +++ b/cpp/cmake_modules/FindNumPy.cmake @@ -0,0 +1,100 @@ +# - Find the NumPy libraries +# This module finds if NumPy is installed, and sets the following variables +# indicating where it is. +# +# TODO: Update to provide the libraries and paths for linking npymath lib. +# +# NUMPY_FOUND - was NumPy found +# NUMPY_VERSION - the version of NumPy found as a string +# NUMPY_VERSION_MAJOR - the major version number of NumPy +# NUMPY_VERSION_MINOR - the minor version number of NumPy +# NUMPY_VERSION_PATCH - the patch version number of NumPy +# NUMPY_VERSION_DECIMAL - e.g. version 1.6.1 is 10601 +# NUMPY_INCLUDE_DIRS - path to the NumPy include files + +#============================================================================ +# Copyright 2012 Continuum Analytics, Inc. +# +# MIT License +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files +# (the "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to permit +# persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be included +# in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS +# OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR +# OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +# ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. +# +#============================================================================ + +# Finding NumPy involves calling the Python interpreter +if(NumPy_FIND_REQUIRED) + find_package(PythonInterp REQUIRED) +else() + find_package(PythonInterp) +endif() + +if(NOT PYTHONINTERP_FOUND) + set(NUMPY_FOUND FALSE) + return() +endif() + +execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "import numpy as n; print(n.__version__); print(n.get_include());" + RESULT_VARIABLE _NUMPY_SEARCH_SUCCESS + OUTPUT_VARIABLE _NUMPY_VALUES_OUTPUT + ERROR_VARIABLE _NUMPY_ERROR_VALUE + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(NOT _NUMPY_SEARCH_SUCCESS MATCHES 0) + if(NumPy_FIND_REQUIRED) + message(FATAL_ERROR + "NumPy import failure:\n${_NUMPY_ERROR_VALUE}") + endif() + set(NUMPY_FOUND FALSE) + return() +endif() + +# Convert the process output into a list +string(REGEX REPLACE ";" "\\\\;" _NUMPY_VALUES ${_NUMPY_VALUES_OUTPUT}) +string(REGEX REPLACE "\n" ";" _NUMPY_VALUES ${_NUMPY_VALUES}) +list(GET _NUMPY_VALUES 0 NUMPY_VERSION) +list(GET _NUMPY_VALUES 1 NUMPY_INCLUDE_DIRS) + +string(REGEX MATCH "^[0-9]+\\.[0-9]+\\.[0-9]+" _VER_CHECK "${NUMPY_VERSION}") +if("${_VER_CHECK}" STREQUAL "") + # The output from Python was unexpected. Raise an error always + # here, because we found NumPy, but it appears to be corrupted somehow. + message(FATAL_ERROR + "Requested version and include path from NumPy, got instead:\n${_NUMPY_VALUES_OUTPUT}\n") + return() +endif() + +# Make sure all directory separators are '/' +string(REGEX REPLACE "\\\\" "/" NUMPY_INCLUDE_DIRS ${NUMPY_INCLUDE_DIRS}) + +# Get the major and minor version numbers +string(REGEX REPLACE "\\." ";" _NUMPY_VERSION_LIST ${NUMPY_VERSION}) +list(GET _NUMPY_VERSION_LIST 0 NUMPY_VERSION_MAJOR) +list(GET _NUMPY_VERSION_LIST 1 NUMPY_VERSION_MINOR) +list(GET _NUMPY_VERSION_LIST 2 NUMPY_VERSION_PATCH) +string(REGEX MATCH "[0-9]*" NUMPY_VERSION_PATCH ${NUMPY_VERSION_PATCH}) +math(EXPR NUMPY_VERSION_DECIMAL + "(${NUMPY_VERSION_MAJOR} * 10000) + (${NUMPY_VERSION_MINOR} * 100) + ${NUMPY_VERSION_PATCH}") + +find_package_message(NUMPY + "Found NumPy: version \"${NUMPY_VERSION}\" ${NUMPY_INCLUDE_DIRS}" + "${NUMPY_INCLUDE_DIRS}${NUMPY_VERSION}") + +set(NUMPY_FOUND TRUE) http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/cmake_modules/FindPythonLibsNew.cmake ---------------------------------------------------------------------- diff --git a/cpp/cmake_modules/FindPythonLibsNew.cmake b/cpp/cmake_modules/FindPythonLibsNew.cmake new file mode 100644 index 0000000..1000a95 --- /dev/null +++ b/cpp/cmake_modules/FindPythonLibsNew.cmake @@ -0,0 +1,241 @@ +# - Find python libraries +# This module finds the libraries corresponding to the Python interpeter +# FindPythonInterp provides. +# This code sets the following variables: +# +# PYTHONLIBS_FOUND - have the Python libs been found +# PYTHON_PREFIX - path to the Python installation +# PYTHON_LIBRARIES - path to the python library +# PYTHON_INCLUDE_DIRS - path to where Python.h is found +# PYTHON_SITE_PACKAGES - path to installation site-packages +# PYTHON_IS_DEBUG - whether the Python interpreter is a debug build +# +# PYTHON_INCLUDE_PATH - path to where Python.h is found (deprecated) +# +# A function PYTHON_ADD_MODULE(<name> src1 src2 ... srcN) is defined +# to build modules for python. +# +# Thanks to talljimbo for the patch adding the 'LDVERSION' config +# variable usage. + +#============================================================================= +# Copyright 2001-2009 Kitware, Inc. +# Copyright 2012-2014 Continuum Analytics, Inc. +# +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# * Neither the names of Kitware, Inc., the Insight Software Consortium, +# nor the names of their contributors may be used to endorse or promote +# products derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# # A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#============================================================================= +# (To distribute this file outside of CMake, substitute the full +# License text for the above reference.) + +# Use the Python interpreter to find the libs. +if(PythonLibsNew_FIND_REQUIRED) + find_package(PythonInterp REQUIRED) +else() + find_package(PythonInterp) +endif() + +if(NOT PYTHONINTERP_FOUND) + set(PYTHONLIBS_FOUND FALSE) + return() +endif() + +# According to http://stackoverflow.com/questions/646518/python-how-to-detect-debug-interpreter +# testing whether sys has the gettotalrefcount function is a reliable, +# cross-platform way to detect a CPython debug interpreter. +# +# The library suffix is from the config var LDVERSION sometimes, otherwise +# VERSION. VERSION will typically be like "2.7" on unix, and "27" on windows. +# +# The config var LIBPL is for Linux, and helps on Debian Jessie where the +# addition of multi-arch support shuffled things around. +execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" + "from distutils import sysconfig as s;import sys;import struct; +print('.'.join(str(v) for v in sys.version_info)); +print(sys.prefix); +print(s.get_python_inc(plat_specific=True)); +print(s.get_python_lib(plat_specific=True)); +print(s.get_config_var('SO')); +print(hasattr(sys, 'gettotalrefcount')+0); +print(struct.calcsize('@P')); +print(s.get_config_var('LDVERSION') or s.get_config_var('VERSION')); +print(s.get_config_var('LIBPL')); +" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE _PYTHON_VALUES + ERROR_VARIABLE _PYTHON_ERROR_VALUE + OUTPUT_STRIP_TRAILING_WHITESPACE) + +if(NOT _PYTHON_SUCCESS MATCHES 0) + if(PythonLibsNew_FIND_REQUIRED) + message(FATAL_ERROR + "Python config failure:\n${_PYTHON_ERROR_VALUE}") + endif() + set(PYTHONLIBS_FOUND FALSE) + return() +endif() + +# Convert the process output into a list +string(REGEX REPLACE ";" "\\\\;" _PYTHON_VALUES ${_PYTHON_VALUES}) +string(REGEX REPLACE "\n" ";" _PYTHON_VALUES ${_PYTHON_VALUES}) +list(GET _PYTHON_VALUES 0 _PYTHON_VERSION_LIST) +list(GET _PYTHON_VALUES 1 PYTHON_PREFIX) +list(GET _PYTHON_VALUES 2 PYTHON_INCLUDE_DIR) +list(GET _PYTHON_VALUES 3 PYTHON_SITE_PACKAGES) +list(GET _PYTHON_VALUES 4 PYTHON_MODULE_EXTENSION) +list(GET _PYTHON_VALUES 5 PYTHON_IS_DEBUG) +list(GET _PYTHON_VALUES 6 PYTHON_SIZEOF_VOID_P) +list(GET _PYTHON_VALUES 7 PYTHON_LIBRARY_SUFFIX) +list(GET _PYTHON_VALUES 8 PYTHON_LIBRARY_PATH) + +# Make sure the Python has the same pointer-size as the chosen compiler +# Skip the check on OS X, it doesn't consistently have CMAKE_SIZEOF_VOID_P defined +if((NOT APPLE) AND (NOT "${PYTHON_SIZEOF_VOID_P}" STREQUAL "${CMAKE_SIZEOF_VOID_P}")) + if(PythonLibsNew_FIND_REQUIRED) + math(EXPR _PYTHON_BITS "${PYTHON_SIZEOF_VOID_P} * 8") + math(EXPR _CMAKE_BITS "${CMAKE_SIZEOF_VOID_P} * 8") + message(FATAL_ERROR + "Python config failure: Python is ${_PYTHON_BITS}-bit, " + "chosen compiler is ${_CMAKE_BITS}-bit") + endif() + set(PYTHONLIBS_FOUND FALSE) + return() +endif() + +# The built-in FindPython didn't always give the version numbers +string(REGEX REPLACE "\\." ";" _PYTHON_VERSION_LIST ${_PYTHON_VERSION_LIST}) +list(GET _PYTHON_VERSION_LIST 0 PYTHON_VERSION_MAJOR) +list(GET _PYTHON_VERSION_LIST 1 PYTHON_VERSION_MINOR) +list(GET _PYTHON_VERSION_LIST 2 PYTHON_VERSION_PATCH) + +# Make sure all directory separators are '/' +string(REGEX REPLACE "\\\\" "/" PYTHON_PREFIX ${PYTHON_PREFIX}) +string(REGEX REPLACE "\\\\" "/" PYTHON_INCLUDE_DIR ${PYTHON_INCLUDE_DIR}) +string(REGEX REPLACE "\\\\" "/" PYTHON_SITE_PACKAGES ${PYTHON_SITE_PACKAGES}) + +if(CMAKE_HOST_WIN32) + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + set(PYTHON_LIBRARY + "${PYTHON_PREFIX}/libs/Python${PYTHON_LIBRARY_SUFFIX}.lib") + else() + set(PYTHON_LIBRARY "${PYTHON_PREFIX}/libs/libpython${PYTHON_LIBRARY_SUFFIX}.a") + endif() +elseif(APPLE) + # Seems to require "-undefined dynamic_lookup" instead of linking + # against the .dylib, otherwise it crashes. This flag is added + # below + set(PYTHON_LIBRARY "") + #set(PYTHON_LIBRARY + # "${PYTHON_PREFIX}/lib/libpython${PYTHON_LIBRARY_SUFFIX}.dylib") +else() + if(${PYTHON_SIZEOF_VOID_P} MATCHES 8) + set(_PYTHON_LIBS_SEARCH "${PYTHON_PREFIX}/lib64" "${PYTHON_PREFIX}/lib" "${PYTHON_LIBRARY_PATH}") + else() + set(_PYTHON_LIBS_SEARCH "${PYTHON_PREFIX}/lib" "${PYTHON_LIBRARY_PATH}") + endif() + message(STATUS "Searching for Python libs in ${_PYTHON_LIBS_SEARCH}") + message(STATUS "Looking for python${PYTHON_LIBRARY_SUFFIX}") + # Probably this needs to be more involved. It would be nice if the config + # information the python interpreter itself gave us were more complete. + find_library(PYTHON_LIBRARY + NAMES "python${PYTHON_LIBRARY_SUFFIX}" + PATHS ${_PYTHON_LIBS_SEARCH} + NO_SYSTEM_ENVIRONMENT_PATH) + message(STATUS "Found Python lib ${PYTHON_LIBRARY}") +endif() + +# For backward compatibility, set PYTHON_INCLUDE_PATH, but make it internal. +SET(PYTHON_INCLUDE_PATH "${PYTHON_INCLUDE_DIR}" CACHE INTERNAL + "Path to where Python.h is found (deprecated)") + +MARK_AS_ADVANCED( + PYTHON_LIBRARY + PYTHON_INCLUDE_DIR +) + +# We use PYTHON_INCLUDE_DIR, PYTHON_LIBRARY and PYTHON_DEBUG_LIBRARY for the +# cache entries because they are meant to specify the location of a single +# library. We now set the variables listed by the documentation for this +# module. +SET(PYTHON_INCLUDE_DIRS "${PYTHON_INCLUDE_DIR}") +SET(PYTHON_LIBRARIES "${PYTHON_LIBRARY}") +SET(PYTHON_DEBUG_LIBRARIES "${PYTHON_DEBUG_LIBRARY}") + + +# Don't know how to get to this directory, just doing something simple :P +#INCLUDE(${CMAKE_CURRENT_LIST_DIR}/FindPackageHandleStandardArgs.cmake) +#FIND_PACKAGE_HANDLE_STANDARD_ARGS(PythonLibs DEFAULT_MSG PYTHON_LIBRARIES PYTHON_INCLUDE_DIRS) +find_package_message(PYTHON + "Found PythonLibs: ${PYTHON_LIBRARY}" + "${PYTHON_EXECUTABLE}${PYTHON_VERSION}") + + +# PYTHON_ADD_MODULE(<name> src1 src2 ... srcN) is used to build modules for python. +FUNCTION(PYTHON_ADD_MODULE _NAME ) + GET_PROPERTY(_TARGET_SUPPORTS_SHARED_LIBS + GLOBAL PROPERTY TARGET_SUPPORTS_SHARED_LIBS) + OPTION(PYTHON_ENABLE_MODULE_${_NAME} "Add module ${_NAME}" TRUE) + OPTION(PYTHON_MODULE_${_NAME}_BUILD_SHARED + "Add module ${_NAME} shared" ${_TARGET_SUPPORTS_SHARED_LIBS}) + + # Mark these options as advanced + MARK_AS_ADVANCED(PYTHON_ENABLE_MODULE_${_NAME} + PYTHON_MODULE_${_NAME}_BUILD_SHARED) + + IF(PYTHON_ENABLE_MODULE_${_NAME}) + IF(PYTHON_MODULE_${_NAME}_BUILD_SHARED) + SET(PY_MODULE_TYPE MODULE) + ELSE(PYTHON_MODULE_${_NAME}_BUILD_SHARED) + SET(PY_MODULE_TYPE STATIC) + SET_PROPERTY(GLOBAL APPEND PROPERTY PY_STATIC_MODULES_LIST ${_NAME}) + ENDIF(PYTHON_MODULE_${_NAME}_BUILD_SHARED) + + SET_PROPERTY(GLOBAL APPEND PROPERTY PY_MODULES_LIST ${_NAME}) + ADD_LIBRARY(${_NAME} ${PY_MODULE_TYPE} ${ARGN}) + IF(APPLE) + # On OS X, linking against the Python libraries causes + # segfaults, so do this dynamic lookup instead. + SET_TARGET_PROPERTIES(${_NAME} PROPERTIES LINK_FLAGS + "-undefined dynamic_lookup") + ELSE() + # In general, we should not link against libpython as we do not embed + # the Python interpreter. The python binary itself can then define where + # the symbols should loaded from. + SET_TARGET_PROPERTIES(${_NAME} PROPERTIES LINK_FLAGS + "-Wl,-undefined,dynamic_lookup") + ENDIF() + IF(PYTHON_MODULE_${_NAME}_BUILD_SHARED) + SET_TARGET_PROPERTIES(${_NAME} PROPERTIES PREFIX "${PYTHON_MODULE_PREFIX}") + SET_TARGET_PROPERTIES(${_NAME} PROPERTIES SUFFIX "${PYTHON_MODULE_EXTENSION}") + ELSE() + ENDIF() + + ENDIF(PYTHON_ENABLE_MODULE_${_NAME}) +ENDFUNCTION(PYTHON_ADD_MODULE) http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/CMakeLists.txt b/cpp/src/arrow/python/CMakeLists.txt new file mode 100644 index 0000000..03f5afc --- /dev/null +++ b/cpp/src/arrow/python/CMakeLists.txt @@ -0,0 +1,93 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# + +####################################### +# arrow_python +####################################### + +if (ARROW_BUILD_TESTS) + add_library(arrow_python_test_main STATIC + util/test_main.cc) + + if (APPLE) + target_link_libraries(arrow_python_test_main + gtest + dl) + set_target_properties(arrow_python_test_main + PROPERTIES LINK_FLAGS "-undefined dynamic_lookup") + else() + target_link_libraries(arrow_python_test_main + gtest + pthread + dl + ) + endif() +endif() + +set(ARROW_PYTHON_MIN_TEST_LIBS + arrow_python_test_main + arrow_python_static + arrow_ipc_static + arrow_io_static + arrow_static) + +if(NOT APPLE AND ARROW_BUILD_TESTS) + ADD_THIRDPARTY_LIB(python + SHARED_LIB "${PYTHON_LIBRARIES}") + list(APPEND ARROW_PYTHON_MIN_TEST_LIBS python) +endif() + +set(ARROW_PYTHON_TEST_LINK_LIBS ${ARROW_PYTHON_MIN_TEST_LIBS}) + +# ---------------------------------------------------------------------- + +set(ARROW_PYTHON_SRCS + builtin_convert.cc + common.cc + config.cc + helpers.cc + io.cc + pandas_convert.cc +) + +set(ARROW_PYTHON_SHARED_LINK_LIBS + arrow_io_shared + arrow_ipc_shared + arrow_shared +) + +ADD_ARROW_LIB(arrow_python + SOURCES ${ARROW_PYTHON_SRCS} + SHARED_LINK_FLAGS "" + SHARED_LINK_LIBS ${ARROW_PYTHON_SHARED_LINK_LIBS} + STATIC_LINK_LIBS ${ARROW_IO_SHARED_PRIVATE_LINK_LIBS} +) + +install(FILES + api.h + builtin_convert.h + common.h + config.h + do_import_numpy.h + helpers.h + io.h + numpy_interop.h + pandas_convert.h + type_traits.h + DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/arrow/python") + +# set_target_properties(arrow_python_shared PROPERTIES +# INSTALL_RPATH "\$ORIGIN") + +if (ARROW_BUILD_TESTS) + ADD_ARROW_TEST(pandas-test + STATIC_LINK_LIBS "${ARROW_PYTHON_TEST_LINK_LIBS}") +endif() http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/api.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/api.h b/cpp/src/arrow/python/api.h new file mode 100644 index 0000000..f4f1c0c --- /dev/null +++ b/cpp/src/arrow/python/api.h @@ -0,0 +1,27 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_API_H +#define ARROW_PYTHON_API_H + +#include "arrow/python/builtin_convert.h" +#include "arrow/python/common.h" +#include "arrow/python/helpers.h" +#include "arrow/python/io.h" +#include "arrow/python/pandas_convert.h" + +#endif // ARROW_PYTHON_API_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/builtin_convert.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.cc b/cpp/src/arrow/python/builtin_convert.cc new file mode 100644 index 0000000..9acccc1 --- /dev/null +++ b/cpp/src/arrow/python/builtin_convert.cc @@ -0,0 +1,527 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <Python.h> +#include <datetime.h> +#include <sstream> + +#include "arrow/python/builtin_convert.h" + +#include "arrow/api.h" +#include "arrow/status.h" + +#include "arrow/python/helpers.h" +#include "arrow/python/util/datetime.h" + +namespace arrow { +namespace py { + +static inline bool IsPyInteger(PyObject* obj) { +#if PYARROW_IS_PY2 + return PyLong_Check(obj) || PyInt_Check(obj); +#else + return PyLong_Check(obj); +#endif +} + +class ScalarVisitor { + public: + ScalarVisitor() + : total_count_(0), + none_count_(0), + bool_count_(0), + int_count_(0), + date_count_(0), + timestamp_count_(0), + float_count_(0), + binary_count_(0), + unicode_count_(0) {} + + void Visit(PyObject* obj) { + ++total_count_; + if (obj == Py_None) { + ++none_count_; + } else if (PyBool_Check(obj)) { + ++bool_count_; + } else if (PyFloat_Check(obj)) { + ++float_count_; + } else if (IsPyInteger(obj)) { + ++int_count_; + } else if (PyDate_CheckExact(obj)) { + ++date_count_; + } else if (PyDateTime_CheckExact(obj)) { + ++timestamp_count_; + } else if (PyBytes_Check(obj)) { + ++binary_count_; + } else if (PyUnicode_Check(obj)) { + ++unicode_count_; + } else { + // TODO(wesm): accumulate error information somewhere + } + } + + std::shared_ptr<DataType> GetType() { + // TODO(wesm): handling mixed-type cases + if (float_count_) { + return float64(); + } else if (int_count_) { + // TODO(wesm): tighter type later + return int64(); + } else if (date_count_) { + return date64(); + } else if (timestamp_count_) { + return timestamp(TimeUnit::MICRO); + } else if (bool_count_) { + return boolean(); + } else if (binary_count_) { + return binary(); + } else if (unicode_count_) { + return utf8(); + } else { + return null(); + } + } + + int64_t total_count() const { return total_count_; } + + private: + int64_t total_count_; + int64_t none_count_; + int64_t bool_count_; + int64_t int_count_; + int64_t date_count_; + int64_t timestamp_count_; + int64_t float_count_; + int64_t binary_count_; + int64_t unicode_count_; + + // Place to accumulate errors + // std::vector<Status> errors_; +}; + +static constexpr int MAX_NESTING_LEVELS = 32; + +class SeqVisitor { + public: + SeqVisitor() : max_nesting_level_(0) { + memset(nesting_histogram_, 0, MAX_NESTING_LEVELS * sizeof(int)); + } + + Status Visit(PyObject* obj, int level = 0) { + Py_ssize_t size = PySequence_Size(obj); + + if (level > max_nesting_level_) { max_nesting_level_ = level; } + + for (int64_t i = 0; i < size; ++i) { + // TODO(wesm): Error checking? + // TODO(wesm): Specialize for PyList_GET_ITEM? + OwnedRef item_ref(PySequence_GetItem(obj, i)); + PyObject* item = item_ref.obj(); + + if (PyList_Check(item)) { + RETURN_NOT_OK(Visit(item, level + 1)); + } else if (PyDict_Check(item)) { + return Status::NotImplemented("No type inference for dicts"); + } else { + // We permit nulls at any level of nesting + if (item == Py_None) { + // TODO + } else { + ++nesting_histogram_[level]; + scalars_.Visit(item); + } + } + } + return Status::OK(); + } + + std::shared_ptr<DataType> GetType() { + if (scalars_.total_count() == 0) { + if (max_nesting_level_ == 0) { + return null(); + } else { + return nullptr; + } + } else { + std::shared_ptr<DataType> result = scalars_.GetType(); + for (int i = 0; i < max_nesting_level_; ++i) { + result = std::make_shared<ListType>(result); + } + return result; + } + } + + Status Validate() const { + if (scalars_.total_count() > 0) { + if (num_nesting_levels() > 1) { + return Status::Invalid("Mixed nesting levels not supported"); + } else if (max_observed_level() < max_nesting_level_) { + return Status::Invalid("Mixed nesting levels not supported"); + } + } + return Status::OK(); + } + + int max_observed_level() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { result = i; } + } + return result; + } + + int num_nesting_levels() const { + int result = 0; + for (int i = 0; i < MAX_NESTING_LEVELS; ++i) { + if (nesting_histogram_[i] > 0) { ++result; } + } + return result; + } + + private: + ScalarVisitor scalars_; + + // Track observed + int max_nesting_level_; + int nesting_histogram_[MAX_NESTING_LEVELS]; +}; + +// Non-exhaustive type inference +Status InferArrowType(PyObject* obj, int64_t* size, std::shared_ptr<DataType>* out_type) { + *size = PySequence_Size(obj); + if (PyErr_Occurred()) { + // Not a sequence + PyErr_Clear(); + return Status::TypeError("Object is not a sequence"); + } + + // For 0-length sequences, refuse to guess + if (*size == 0) { *out_type = null(); } + + SeqVisitor seq_visitor; + RETURN_NOT_OK(seq_visitor.Visit(obj)); + RETURN_NOT_OK(seq_visitor.Validate()); + + *out_type = seq_visitor.GetType(); + + if (*out_type == nullptr) { return Status::TypeError("Unable to determine data type"); } + + return Status::OK(); +} + +// Marshal Python sequence (list, tuple, etc.) to Arrow array +class SeqConverter { + public: + virtual Status Init(const std::shared_ptr<ArrayBuilder>& builder) { + builder_ = builder; + return Status::OK(); + } + + virtual Status AppendData(PyObject* seq) = 0; + + protected: + std::shared_ptr<ArrayBuilder> builder_; +}; + +template <typename BuilderType> +class TypedConverter : public SeqConverter { + public: + Status Init(const std::shared_ptr<ArrayBuilder>& builder) override { + builder_ = builder; + typed_builder_ = static_cast<BuilderType*>(builder.get()); + return Status::OK(); + } + + protected: + BuilderType* typed_builder_; +}; + +class BoolConverter : public TypedConverter<BooleanBuilder> { + public: + Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + if (item.obj() == Py_True) { + typed_builder_->Append(true); + } else { + typed_builder_->Append(false); + } + } + } + return Status::OK(); + } +}; + +class Int64Converter : public TypedConverter<Int64Builder> { + public: + Status AppendData(PyObject* seq) override { + int64_t val; + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + val = PyLong_AsLongLong(item.obj()); + RETURN_IF_PYERROR(); + typed_builder_->Append(val); + } + } + return Status::OK(); + } +}; + +class DateConverter : public TypedConverter<Date64Builder> { + public: + Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + PyDateTime_Date* pydate = reinterpret_cast<PyDateTime_Date*>(item.obj()); + typed_builder_->Append(PyDate_to_ms(pydate)); + } + } + return Status::OK(); + } +}; + +class TimestampConverter : public TypedConverter<TimestampBuilder> { + public: + Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + PyDateTime_DateTime* pydatetime = + reinterpret_cast<PyDateTime_DateTime*>(item.obj()); + struct tm datetime = {0}; + datetime.tm_year = PyDateTime_GET_YEAR(pydatetime) - 1900; + datetime.tm_mon = PyDateTime_GET_MONTH(pydatetime) - 1; + datetime.tm_mday = PyDateTime_GET_DAY(pydatetime); + datetime.tm_hour = PyDateTime_DATE_GET_HOUR(pydatetime); + datetime.tm_min = PyDateTime_DATE_GET_MINUTE(pydatetime); + datetime.tm_sec = PyDateTime_DATE_GET_SECOND(pydatetime); + int us = PyDateTime_DATE_GET_MICROSECOND(pydatetime); + RETURN_IF_PYERROR(); + struct tm epoch = {0}; + epoch.tm_year = 70; + epoch.tm_mday = 1; + // Microseconds since the epoch + int64_t val = lrint(difftime(mktime(&datetime), mktime(&epoch))) * 1000000 + us; + typed_builder_->Append(val); + } + } + return Status::OK(); + } +}; + +class DoubleConverter : public TypedConverter<DoubleBuilder> { + public: + Status AppendData(PyObject* seq) override { + double val; + Py_ssize_t size = PySequence_Size(seq); + RETURN_NOT_OK(typed_builder_->Reserve(size)); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + typed_builder_->AppendNull(); + } else { + val = PyFloat_AsDouble(item.obj()); + RETURN_IF_PYERROR(); + typed_builder_->Append(val); + } + } + return Status::OK(); + } +}; + +class BytesConverter : public TypedConverter<BinaryBuilder> { + public: + Status AppendData(PyObject* seq) override { + PyObject* item; + PyObject* bytes_obj; + OwnedRef tmp; + const char* bytes; + int64_t length; + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + item = PySequence_GetItem(seq, i); + OwnedRef holder(item); + + if (item == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + continue; + } else if (PyUnicode_Check(item)) { + tmp.reset(PyUnicode_AsUTF8String(item)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + } else if (PyBytes_Check(item)) { + bytes_obj = item; + } else { + return Status::TypeError("Non-string value encountered"); + } + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_NOT_OK(typed_builder_->Append(bytes, length)); + } + return Status::OK(); + } +}; + +class UTF8Converter : public TypedConverter<StringBuilder> { + public: + Status AppendData(PyObject* seq) override { + PyObject* item; + PyObject* bytes_obj; + OwnedRef tmp; + const char* bytes; + int64_t length; + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + item = PySequence_GetItem(seq, i); + OwnedRef holder(item); + + if (item == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + continue; + } else if (!PyUnicode_Check(item)) { + return Status::TypeError("Non-unicode value encountered"); + } + tmp.reset(PyUnicode_AsUTF8String(item)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_NOT_OK(typed_builder_->Append(bytes, length)); + } + return Status::OK(); + } +}; + +class ListConverter : public TypedConverter<ListBuilder> { + public: + Status Init(const std::shared_ptr<ArrayBuilder>& builder) override; + + Status AppendData(PyObject* seq) override { + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + OwnedRef item(PySequence_GetItem(seq, i)); + if (item.obj() == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + } else { + typed_builder_->Append(); + RETURN_NOT_OK(value_converter_->AppendData(item.obj())); + } + } + return Status::OK(); + } + + protected: + std::shared_ptr<SeqConverter> value_converter_; +}; + +// Dynamic constructor for sequence converters +std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type) { + switch (type->type) { + case Type::BOOL: + return std::make_shared<BoolConverter>(); + case Type::INT64: + return std::make_shared<Int64Converter>(); + case Type::DATE64: + return std::make_shared<DateConverter>(); + case Type::TIMESTAMP: + return std::make_shared<TimestampConverter>(); + case Type::DOUBLE: + return std::make_shared<DoubleConverter>(); + case Type::BINARY: + return std::make_shared<BytesConverter>(); + case Type::STRING: + return std::make_shared<UTF8Converter>(); + case Type::LIST: + return std::make_shared<ListConverter>(); + case Type::STRUCT: + default: + return nullptr; + break; + } +} + +Status ListConverter::Init(const std::shared_ptr<ArrayBuilder>& builder) { + builder_ = builder; + typed_builder_ = static_cast<ListBuilder*>(builder.get()); + + value_converter_ = + GetConverter(static_cast<ListType*>(builder->type().get())->value_type()); + if (value_converter_ == nullptr) { + return Status::NotImplemented("value type not implemented"); + } + + value_converter_->Init(typed_builder_->value_builder()); + return Status::OK(); +} + +Status AppendPySequence(PyObject* obj, const std::shared_ptr<DataType>& type, + const std::shared_ptr<ArrayBuilder>& builder) { + std::shared_ptr<SeqConverter> converter = GetConverter(type); + if (converter == nullptr) { + std::stringstream ss; + ss << "No type converter implemented for " << type->ToString(); + return Status::NotImplemented(ss.str()); + } + converter->Init(builder); + + return converter->AppendData(obj); +} + +Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out) { + std::shared_ptr<DataType> type; + int64_t size; + PyDateTime_IMPORT; + RETURN_NOT_OK(InferArrowType(obj, &size, &type)); + + // Handle NA / NullType case + if (type->type == Type::NA) { + out->reset(new NullArray(size)); + return Status::OK(); + } + + // Give the sequence converter an array builder + std::shared_ptr<ArrayBuilder> builder; + RETURN_NOT_OK(MakeBuilder(pool, type, &builder)); + RETURN_NOT_OK(AppendPySequence(obj, type, builder)); + + return builder->Finish(out); +} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/builtin_convert.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/builtin_convert.h b/cpp/src/arrow/python/builtin_convert.h new file mode 100644 index 0000000..7b50990 --- /dev/null +++ b/cpp/src/arrow/python/builtin_convert.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Functions for converting between CPython built-in data structures and Arrow +// data structures + +#ifndef ARROW_PYTHON_ADAPTERS_BUILTIN_H +#define ARROW_PYTHON_ADAPTERS_BUILTIN_H + +#include <Python.h> + +#include <memory> + +#include <arrow/type.h> + +#include "arrow/util/visibility.h" + +#include "arrow/python/common.h" + +namespace arrow { + +class Array; +class Status; + +namespace py { + +ARROW_EXPORT arrow::Status InferArrowType( + PyObject* obj, int64_t* size, std::shared_ptr<arrow::DataType>* out_type); + +ARROW_EXPORT arrow::Status AppendPySequence(PyObject* obj, + const std::shared_ptr<arrow::DataType>& type, + const std::shared_ptr<arrow::ArrayBuilder>& builder); + +ARROW_EXPORT +Status ConvertPySequence(PyObject* obj, MemoryPool* pool, std::shared_ptr<Array>* out); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_ADAPTERS_BUILTIN_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/common.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/common.cc b/cpp/src/arrow/python/common.cc new file mode 100644 index 0000000..a5aea30 --- /dev/null +++ b/cpp/src/arrow/python/common.cc @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/common.h" + +#include <cstdlib> +#include <mutex> +#include <sstream> + +#include "arrow/memory_pool.h" +#include "arrow/status.h" + +namespace arrow { +namespace py { + +static std::mutex memory_pool_mutex; +static MemoryPool* default_python_pool = nullptr; + +void set_default_memory_pool(MemoryPool* pool) { + std::lock_guard<std::mutex> guard(memory_pool_mutex); + default_python_pool = pool; +} + +MemoryPool* get_memory_pool() { + std::lock_guard<std::mutex> guard(memory_pool_mutex); + if (default_python_pool) { + return default_python_pool; + } else { + return default_memory_pool(); + } +} + +// ---------------------------------------------------------------------- +// PyBuffer + +PyBuffer::PyBuffer(PyObject* obj) : Buffer(nullptr, 0) { + if (PyObject_CheckBuffer(obj)) { + obj_ = PyMemoryView_FromObject(obj); + Py_buffer* buffer = PyMemoryView_GET_BUFFER(obj_); + data_ = reinterpret_cast<const uint8_t*>(buffer->buf); + size_ = buffer->len; + capacity_ = buffer->len; + is_mutable_ = false; + Py_INCREF(obj_); + } +} + +PyBuffer::~PyBuffer() { + PyAcquireGIL lock; + Py_DECREF(obj_); +} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/common.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/common.h b/cpp/src/arrow/python/common.h new file mode 100644 index 0000000..f1be471 --- /dev/null +++ b/cpp/src/arrow/python/common.h @@ -0,0 +1,139 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_COMMON_H +#define ARROW_PYTHON_COMMON_H + +#include <string> + +#include "arrow/python/config.h" + +#include "arrow/buffer.h" +#include "arrow/util/macros.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +class MemoryPool; + +namespace py { + +class PyAcquireGIL { + public: + PyAcquireGIL() { state_ = PyGILState_Ensure(); } + + ~PyAcquireGIL() { PyGILState_Release(state_); } + + private: + PyGILState_STATE state_; + DISALLOW_COPY_AND_ASSIGN(PyAcquireGIL); +}; + +#define PYARROW_IS_PY2 PY_MAJOR_VERSION <= 2 + +class OwnedRef { + public: + OwnedRef() : obj_(nullptr) {} + + explicit OwnedRef(PyObject* obj) : obj_(obj) {} + + ~OwnedRef() { + PyAcquireGIL lock; + Py_XDECREF(obj_); + } + + void reset(PyObject* obj) { + if (obj_ != nullptr) { Py_XDECREF(obj_); } + obj_ = obj; + } + + void release() { obj_ = nullptr; } + + PyObject* obj() const { return obj_; } + + private: + PyObject* obj_; +}; + +struct PyObjectStringify { + OwnedRef tmp_obj; + const char* bytes; + + explicit PyObjectStringify(PyObject* obj) { + PyObject* bytes_obj; + if (PyUnicode_Check(obj)) { + bytes_obj = PyUnicode_AsUTF8String(obj); + tmp_obj.reset(bytes_obj); + } else { + bytes_obj = obj; + } + bytes = PyBytes_AsString(bytes_obj); + } +}; + +// TODO(wesm): We can just let errors pass through. To be explored later +#define RETURN_IF_PYERROR() \ + if (PyErr_Occurred()) { \ + PyObject *exc_type, *exc_value, *traceback; \ + PyErr_Fetch(&exc_type, &exc_value, &traceback); \ + PyObjectStringify stringified(exc_value); \ + std::string message(stringified.bytes); \ + Py_DECREF(exc_type); \ + Py_XDECREF(exc_value); \ + Py_XDECREF(traceback); \ + PyErr_Clear(); \ + return Status::UnknownError(message); \ + } + +// Return the common PyArrow memory pool +ARROW_EXPORT void set_default_memory_pool(MemoryPool* pool); +ARROW_EXPORT MemoryPool* get_memory_pool(); + +class ARROW_EXPORT NumPyBuffer : public Buffer { + public: + explicit NumPyBuffer(PyArrayObject* arr) : Buffer(nullptr, 0) { + arr_ = arr; + Py_INCREF(arr); + + data_ = reinterpret_cast<const uint8_t*>(PyArray_DATA(arr_)); + size_ = PyArray_SIZE(arr_) * PyArray_DESCR(arr_)->elsize; + capacity_ = size_; + } + + virtual ~NumPyBuffer() { Py_XDECREF(arr_); } + + private: + PyArrayObject* arr_; +}; + +class ARROW_EXPORT PyBuffer : public Buffer { + public: + /// Note that the GIL must be held when calling the PyBuffer constructor. + /// + /// While memoryview objects support multi-demensional buffers, PyBuffer only supports + /// one-dimensional byte buffers. + explicit PyBuffer(PyObject* obj); + ~PyBuffer(); + + private: + PyObject* obj_; +}; + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_COMMON_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/config.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/config.cc b/cpp/src/arrow/python/config.cc new file mode 100644 index 0000000..2abc4dd --- /dev/null +++ b/cpp/src/arrow/python/config.cc @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <Python.h> + +#include "arrow/python/config.h" + +namespace arrow { +namespace py { + +void Init() {} + +PyObject* numpy_nan = nullptr; + +void set_numpy_nan(PyObject* obj) { + Py_INCREF(obj); + numpy_nan = obj; +} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/config.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/config.h b/cpp/src/arrow/python/config.h new file mode 100644 index 0000000..dd554e0 --- /dev/null +++ b/cpp/src/arrow/python/config.h @@ -0,0 +1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_PYTHON_CONFIG_H +#define ARROW_PYTHON_CONFIG_H + +#include <Python.h> + +#include "arrow/python/numpy_interop.h" +#include "arrow/util/visibility.h" + +#if PY_MAJOR_VERSION >= 3 +#define PyString_Check PyUnicode_Check +#endif + +namespace arrow { +namespace py { + +ARROW_EXPORT +extern PyObject* numpy_nan; + +ARROW_EXPORT +void Init(); + +ARROW_EXPORT +void set_numpy_nan(PyObject* obj); + +} // namespace py +} // namespace arrow + +#endif // ARROW_PYTHON_CONFIG_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/do_import_numpy.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/do_import_numpy.h b/cpp/src/arrow/python/do_import_numpy.h new file mode 100644 index 0000000..bb4a382 --- /dev/null +++ b/cpp/src/arrow/python/do_import_numpy.h @@ -0,0 +1,21 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Trick borrowed from dynd-python for initializing the NumPy array API + +// Trigger the array import (inversion of NO_IMPORT_ARRAY) +#define NUMPY_IMPORT_ARRAY http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/helpers.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/helpers.cc b/cpp/src/arrow/python/helpers.cc new file mode 100644 index 0000000..add2d9a --- /dev/null +++ b/cpp/src/arrow/python/helpers.cc @@ -0,0 +1,55 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/helpers.h" + +#include <arrow/api.h> + +namespace arrow { +namespace py { + +#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \ + case Type::NAME: \ + return FACTORY(); \ + break; + +std::shared_ptr<DataType> GetPrimitiveType(Type::type type) { + switch (type) { + case Type::NA: + return null(); + GET_PRIMITIVE_TYPE(UINT8, uint8); + GET_PRIMITIVE_TYPE(INT8, int8); + GET_PRIMITIVE_TYPE(UINT16, uint16); + GET_PRIMITIVE_TYPE(INT16, int16); + GET_PRIMITIVE_TYPE(UINT32, uint32); + GET_PRIMITIVE_TYPE(INT32, int32); + GET_PRIMITIVE_TYPE(UINT64, uint64); + GET_PRIMITIVE_TYPE(INT64, int64); + GET_PRIMITIVE_TYPE(DATE32, date32); + GET_PRIMITIVE_TYPE(DATE64, date64); + GET_PRIMITIVE_TYPE(BOOL, boolean); + GET_PRIMITIVE_TYPE(FLOAT, float32); + GET_PRIMITIVE_TYPE(DOUBLE, float64); + GET_PRIMITIVE_TYPE(BINARY, binary); + GET_PRIMITIVE_TYPE(STRING, utf8); + default: + return nullptr; + } +} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/helpers.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/helpers.h b/cpp/src/arrow/python/helpers.h new file mode 100644 index 0000000..611e814 --- /dev/null +++ b/cpp/src/arrow/python/helpers.h @@ -0,0 +1,35 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_HELPERS_H +#define PYARROW_HELPERS_H + +#include <memory> + +#include "arrow/type.h" +#include "arrow/util/visibility.h" + +namespace arrow { +namespace py { + +ARROW_EXPORT +std::shared_ptr<DataType> GetPrimitiveType(Type::type type); + +} // namespace py +} // namespace arrow + +#endif // PYARROW_HELPERS_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/io.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/io.cc b/cpp/src/arrow/python/io.cc new file mode 100644 index 0000000..ba82a45 --- /dev/null +++ b/cpp/src/arrow/python/io.cc @@ -0,0 +1,222 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/python/io.h" + +#include <cstdint> +#include <cstdlib> +#include <string> + +#include "arrow/io/memory.h" +#include "arrow/memory_pool.h" +#include "arrow/status.h" + +#include "arrow/python/common.h" + +namespace arrow { +namespace py { + +// ---------------------------------------------------------------------- +// Python file + +PythonFile::PythonFile(PyObject* file) : file_(file) { + Py_INCREF(file_); +} + +PythonFile::~PythonFile() { + Py_DECREF(file_); +} + +static Status CheckPyError() { + if (PyErr_Occurred()) { + PyObject *exc_type, *exc_value, *traceback; + PyErr_Fetch(&exc_type, &exc_value, &traceback); + PyObjectStringify stringified(exc_value); + std::string message(stringified.bytes); + Py_XDECREF(exc_type); + Py_XDECREF(exc_value); + Py_XDECREF(traceback); + PyErr_Clear(); + return Status::IOError(message); + } + return Status::OK(); +} + +// This is annoying: because C++11 does not allow implicit conversion of string +// literals to non-const char*, we need to go through some gymnastics to use +// PyObject_CallMethod without a lot of pain (its arguments are non-const +// char*) +template <typename... ArgTypes> +static inline PyObject* cpp_PyObject_CallMethod( + PyObject* obj, const char* method_name, const char* argspec, ArgTypes... args) { + return PyObject_CallMethod( + obj, const_cast<char*>(method_name), const_cast<char*>(argspec), args...); +} + +Status PythonFile::Close() { + // whence: 0 for relative to start of file, 2 for end of file + PyObject* result = cpp_PyObject_CallMethod(file_, "close", "()"); + Py_XDECREF(result); + ARROW_RETURN_NOT_OK(CheckPyError()); + return Status::OK(); +} + +Status PythonFile::Seek(int64_t position, int whence) { + // whence: 0 for relative to start of file, 2 for end of file + PyObject* result = cpp_PyObject_CallMethod(file_, "seek", "(ii)", position, whence); + Py_XDECREF(result); + ARROW_RETURN_NOT_OK(CheckPyError()); + return Status::OK(); +} + +Status PythonFile::Read(int64_t nbytes, PyObject** out) { + PyObject* result = cpp_PyObject_CallMethod(file_, "read", "(i)", nbytes); + ARROW_RETURN_NOT_OK(CheckPyError()); + *out = result; + return Status::OK(); +} + +Status PythonFile::Write(const uint8_t* data, int64_t nbytes) { + PyObject* py_data = + PyBytes_FromStringAndSize(reinterpret_cast<const char*>(data), nbytes); + ARROW_RETURN_NOT_OK(CheckPyError()); + + PyObject* result = cpp_PyObject_CallMethod(file_, "write", "(O)", py_data); + Py_XDECREF(py_data); + Py_XDECREF(result); + ARROW_RETURN_NOT_OK(CheckPyError()); + return Status::OK(); +} + +Status PythonFile::Tell(int64_t* position) { + PyObject* result = cpp_PyObject_CallMethod(file_, "tell", "()"); + ARROW_RETURN_NOT_OK(CheckPyError()); + + *position = PyLong_AsLongLong(result); + Py_DECREF(result); + + // PyLong_AsLongLong can raise OverflowError + ARROW_RETURN_NOT_OK(CheckPyError()); + + return Status::OK(); +} + +// ---------------------------------------------------------------------- +// Seekable input stream + +PyReadableFile::PyReadableFile(PyObject* file) { + file_.reset(new PythonFile(file)); +} + +PyReadableFile::~PyReadableFile() {} + +Status PyReadableFile::Close() { + PyAcquireGIL lock; + return file_->Close(); +} + +Status PyReadableFile::Seek(int64_t position) { + PyAcquireGIL lock; + return file_->Seek(position, 0); +} + +Status PyReadableFile::Tell(int64_t* position) { + PyAcquireGIL lock; + return file_->Tell(position); +} + +Status PyReadableFile::Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) { + PyAcquireGIL lock; + PyObject* bytes_obj; + ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); + + *bytes_read = PyBytes_GET_SIZE(bytes_obj); + std::memcpy(out, PyBytes_AS_STRING(bytes_obj), *bytes_read); + Py_DECREF(bytes_obj); + + return Status::OK(); +} + +Status PyReadableFile::Read(int64_t nbytes, std::shared_ptr<Buffer>* out) { + PyAcquireGIL lock; + + PyObject* bytes_obj; + ARROW_RETURN_NOT_OK(file_->Read(nbytes, &bytes_obj)); + + *out = std::make_shared<PyBuffer>(bytes_obj); + Py_DECREF(bytes_obj); + + return Status::OK(); +} + +Status PyReadableFile::GetSize(int64_t* size) { + PyAcquireGIL lock; + + int64_t current_position; + + ARROW_RETURN_NOT_OK(file_->Tell(¤t_position)); + + ARROW_RETURN_NOT_OK(file_->Seek(0, 2)); + + int64_t file_size; + ARROW_RETURN_NOT_OK(file_->Tell(&file_size)); + + // Restore previous file position + ARROW_RETURN_NOT_OK(file_->Seek(current_position, 0)); + + *size = file_size; + return Status::OK(); +} + +bool PyReadableFile::supports_zero_copy() const { + return false; +} + +// ---------------------------------------------------------------------- +// Output stream + +PyOutputStream::PyOutputStream(PyObject* file) { + file_.reset(new PythonFile(file)); +} + +PyOutputStream::~PyOutputStream() {} + +Status PyOutputStream::Close() { + PyAcquireGIL lock; + return file_->Close(); +} + +Status PyOutputStream::Tell(int64_t* position) { + PyAcquireGIL lock; + return file_->Tell(position); +} + +Status PyOutputStream::Write(const uint8_t* data, int64_t nbytes) { + PyAcquireGIL lock; + return file_->Write(data, nbytes); +} + +// ---------------------------------------------------------------------- +// A readable file that is backed by a PyBuffer + +PyBytesReader::PyBytesReader(PyObject* obj) + : io::BufferReader(std::make_shared<PyBuffer>(obj)) {} + +PyBytesReader::~PyBytesReader() {} + +} // namespace py +} // namespace arrow http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/io.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/io.h b/cpp/src/arrow/python/io.h new file mode 100644 index 0000000..905bd6c --- /dev/null +++ b/cpp/src/arrow/python/io.h @@ -0,0 +1,99 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_IO_H +#define PYARROW_IO_H + +#include "arrow/io/interfaces.h" +#include "arrow/io/memory.h" +#include "arrow/util/visibility.h" + +#include "arrow/python/config.h" + +#include "arrow/python/common.h" + +namespace arrow { + +class MemoryPool; + +namespace py { + +// A common interface to a Python file-like object. Must acquire GIL before +// calling any methods +class PythonFile { + public: + explicit PythonFile(PyObject* file); + ~PythonFile(); + + Status Close(); + Status Seek(int64_t position, int whence); + Status Read(int64_t nbytes, PyObject** out); + Status Tell(int64_t* position); + Status Write(const uint8_t* data, int64_t nbytes); + + private: + PyObject* file_; +}; + +class ARROW_EXPORT PyReadableFile : public io::RandomAccessFile { + public: + explicit PyReadableFile(PyObject* file); + virtual ~PyReadableFile(); + + Status Close() override; + + Status Read(int64_t nbytes, int64_t* bytes_read, uint8_t* out) override; + Status Read(int64_t nbytes, std::shared_ptr<Buffer>* out) override; + + Status GetSize(int64_t* size) override; + + Status Seek(int64_t position) override; + + Status Tell(int64_t* position) override; + + bool supports_zero_copy() const override; + + private: + std::unique_ptr<PythonFile> file_; +}; + +class ARROW_EXPORT PyOutputStream : public io::OutputStream { + public: + explicit PyOutputStream(PyObject* file); + virtual ~PyOutputStream(); + + Status Close() override; + Status Tell(int64_t* position) override; + Status Write(const uint8_t* data, int64_t nbytes) override; + + private: + std::unique_ptr<PythonFile> file_; +}; + +// A zero-copy reader backed by a PyBuffer object +class ARROW_EXPORT PyBytesReader : public io::BufferReader { + public: + explicit PyBytesReader(PyObject* obj); + virtual ~PyBytesReader(); +}; + +// TODO(wesm): seekable output files + +} // namespace py +} // namespace arrow + +#endif // PYARROW_IO_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/numpy_interop.h ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/numpy_interop.h b/cpp/src/arrow/python/numpy_interop.h new file mode 100644 index 0000000..0a4b425 --- /dev/null +++ b/cpp/src/arrow/python/numpy_interop.h @@ -0,0 +1,60 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef PYARROW_NUMPY_INTEROP_H +#define PYARROW_NUMPY_INTEROP_H + +#include <Python.h> + +#include <numpy/numpyconfig.h> + +// Don't use the deprecated Numpy functions +#ifdef NPY_1_7_API_VERSION +#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION +#else +#define NPY_ARRAY_NOTSWAPPED NPY_NOTSWAPPED +#define NPY_ARRAY_ALIGNED NPY_ALIGNED +#define NPY_ARRAY_WRITEABLE NPY_WRITEABLE +#define NPY_ARRAY_UPDATEIFCOPY NPY_UPDATEIFCOPY +#endif + +// This is required to be able to access the NumPy C API properly in C++ files +// other than this main one +#define PY_ARRAY_UNIQUE_SYMBOL arrow_ARRAY_API +#ifndef NUMPY_IMPORT_ARRAY +#define NO_IMPORT_ARRAY +#endif + +#include <numpy/arrayobject.h> +#include <numpy/ufuncobject.h> + +namespace arrow { +namespace py { + +inline int import_numpy() { +#ifdef NUMPY_IMPORT_ARRAY + import_array1(-1); + import_umath1(-1); +#endif + + return 0; +} + +} // namespace py +} // namespace arrow + +#endif // PYARROW_NUMPY_INTEROP_H http://git-wip-us.apache.org/repos/asf/arrow/blob/3aac4ade/cpp/src/arrow/python/pandas-test.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/python/pandas-test.cc b/cpp/src/arrow/python/pandas-test.cc new file mode 100644 index 0000000..ae2527e --- /dev/null +++ b/cpp/src/arrow/python/pandas-test.cc @@ -0,0 +1,64 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gtest/gtest.h" + +#include <cstdint> +#include <memory> +#include <string> +#include <vector> + +#include "arrow/array.h" +#include "arrow/builder.h" +#include "arrow/python/pandas_convert.h" +#include "arrow/schema.h" +#include "arrow/table.h" +#include "arrow/test-util.h" +#include "arrow/type.h" + +namespace arrow { +namespace py { + +TEST(PandasConversionTest, TestObjectBlockWriteFails) { + StringBuilder builder(default_memory_pool()); + const char value[] = {'\xf1', '\0'}; + + for (int i = 0; i < 1000; ++i) { + builder.Append(value, strlen(value)); + } + + std::shared_ptr<Array> arr; + ASSERT_OK(builder.Finish(&arr)); + + auto f1 = field("f1", utf8()); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", utf8()); + std::vector<std::shared_ptr<Field>> fields = {f1, f2, f3}; + std::vector<std::shared_ptr<Column>> cols = {std::make_shared<Column>(f1, arr), + std::make_shared<Column>(f2, arr), std::make_shared<Column>(f3, arr)}; + + auto schema = std::make_shared<Schema>(fields); + auto table = std::make_shared<Table>("", schema, cols); + + PyObject* out; + Py_BEGIN_ALLOW_THREADS; + ASSERT_RAISES(UnknownError, ConvertTableToPandas(table, 2, &out)); + Py_END_ALLOW_THREADS; +} + +} // namespace py +} // namespace arrow