Repository: arrow Updated Branches: refs/heads/master 73455b56f -> 268ffbeff
ARROW-374: More precise handling of bytes vs unicode in Python API. Sequences of Python built-in objects that are not all unicode become `arrow::BinaryArray` instead of `arrow::StringArray`, since we cannot be sure that the PyBytes objects are UTF-8-encoded strings. Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #249 from wesm/ARROW-374 and squashes the following commits: 1371a30 [Wes McKinney] py3 fixes 8ac3a49 [Wes McKinney] Consistently convert PyBytes to BinaryArray with pandas, too 83d1c05 [Wes McKinney] Remove print statement c8df606 [Wes McKinney] Timestamp and time cannot be static 4a9aaf4 [Wes McKinney] Add Python interface to BinaryArray, convert PyBytes to binary instead of assuming utf8 unicode Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/268ffbef Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/268ffbef Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/268ffbef Branch: refs/heads/master Commit: 268ffbeffb1cd0617e52d381d500a2d10f61124c Parents: 73455b5 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Wed Dec 21 09:31:56 2016 +0100 Committer: Uwe L. 
Korn <uw...@xhochy.com> Committed: Wed Dec 21 09:31:56 2016 +0100 ---------------------------------------------------------------------- cpp/src/arrow/type.cc | 6 +- python/pyarrow/__init__.py | 5 +- python/pyarrow/array.pyx | 5 ++ python/pyarrow/includes/libarrow.pxd | 6 +- python/pyarrow/scalar.pyx | 16 ++++- python/pyarrow/schema.pyx | 6 ++ python/pyarrow/tests/test_convert_builtin.py | 31 ++++++--- python/pyarrow/tests/test_convert_pandas.py | 18 +++-- python/pyarrow/tests/test_scalars.py | 22 +++++-- python/src/pyarrow/adapters/builtin.cc | 80 ++++++++++++++++------- python/src/pyarrow/adapters/pandas.cc | 65 +++++++++++++++++- python/src/pyarrow/helpers.cc | 50 +++++--------- python/src/pyarrow/helpers.h | 16 ----- 13 files changed, 227 insertions(+), 99 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/cpp/src/arrow/type.cc ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4748cc3..8ff9eea 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -155,13 +155,11 @@ TYPE_FACTORY(binary, BinaryType); TYPE_FACTORY(date, DateType); std::shared_ptr<DataType> timestamp(TimeUnit unit) { - static std::shared_ptr<DataType> result = std::make_shared<TimestampType>(); - return result; + return std::make_shared<TimestampType>(unit); } std::shared_ptr<DataType> time(TimeUnit unit) { - static std::shared_ptr<DataType> result = std::make_shared<TimeType>(); - return result; + return std::make_shared<TimeType>(unit); } std::shared_ptr<DataType> list(const std::shared_ptr<DataType>& value_type) { http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 39ba4c7..9ede934 100644 --- a/python/pyarrow/__init__.py +++ 
b/python/pyarrow/__init__.py @@ -40,13 +40,14 @@ from pyarrow.scalar import (ArrayValue, Scalar, NA, NAType, BooleanValue, Int8Value, Int16Value, Int32Value, Int64Value, UInt8Value, UInt16Value, UInt32Value, UInt64Value, - FloatValue, DoubleValue, ListValue, StringValue) + FloatValue, DoubleValue, ListValue, + BinaryValue, StringValue) from pyarrow.schema import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, timestamp, date, - float_, double, string, + float_, double, binary, string, list_, struct, field, DataType, Field, Schema, schema) http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/array.pyx b/python/pyarrow/array.pyx index 84f1705..c178d5c 100644 --- a/python/pyarrow/array.pyx +++ b/python/pyarrow/array.pyx @@ -238,6 +238,10 @@ cdef class StringArray(Array): pass +cdef class BinaryArray(Array): + pass + + cdef dict _array_classes = { Type_NA: NullArray, Type_BOOL: BooleanArray, @@ -253,6 +257,7 @@ cdef dict _array_classes = { Type_FLOAT: FloatArray, Type_DOUBLE: DoubleArray, Type_LIST: ListArray, + Type_BINARY: BinaryArray, Type_STRING: StringArray, Type_TIMESTAMP: Int64Array, } http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index 419dd74..40fb60d 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -40,6 +40,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: Type_TIMESTAMP" arrow::Type::TIMESTAMP" Type_DATE" arrow::Type::DATE" + Type_BINARY" arrow::Type::BINARY" Type_STRING" arrow::Type::STRING" Type_LIST" arrow::Type::LIST" @@ -161,7 +162,10 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: shared_ptr[CArray] values() 
shared_ptr[CDataType] value_type() - cdef cppclass CStringArray" arrow::StringArray"(CListArray): + cdef cppclass CBinaryArray" arrow::BinaryArray"(CListArray): + const uint8_t* GetValue(int i, int32_t* length) + + cdef cppclass CStringArray" arrow::StringArray"(CBinaryArray): c_string GetString(int i) cdef cppclass CChunkedArray" arrow::ChunkedArray": http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/scalar.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/scalar.pyx b/python/pyarrow/scalar.pyx index 623e3e4..a0610a1 100644 --- a/python/pyarrow/scalar.pyx +++ b/python/pyarrow/scalar.pyx @@ -22,6 +22,7 @@ import pyarrow.schema as schema import datetime +cimport cpython as cp NA = None @@ -170,6 +171,18 @@ cdef class StringValue(ArrayValue): return frombytes(ap.GetString(self.index)) +cdef class BinaryValue(ArrayValue): + + def as_py(self): + cdef: + const uint8_t* ptr + int32_t length + CBinaryArray* ap = <CBinaryArray*> self.sp_array.get() + + ptr = ap.GetValue(self.index, &length) + return cp.PyBytes_FromStringAndSize(<const char*>(ptr), length) + + cdef class ListValue(ArrayValue): def __len__(self): @@ -218,7 +231,8 @@ cdef dict _scalar_classes = { Type_FLOAT: FloatValue, Type_DOUBLE: DoubleValue, Type_LIST: ListValue, - Type_STRING: StringValue + Type_BINARY: BinaryValue, + Type_STRING: StringValue, } cdef object box_arrow_scalar(DataType type, http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/schema.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/schema.pyx b/python/pyarrow/schema.pyx index d05ac9e..7a69b0f 100644 --- a/python/pyarrow/schema.pyx +++ b/python/pyarrow/schema.pyx @@ -215,6 +215,12 @@ def string(): """ return primitive_type(Type_STRING) +def binary(): + """ + Binary (PyBytes-like) type + """ + return primitive_type(Type_BINARY) + def list_(DataType value_type): cdef DataType out = DataType() 
cdef shared_ptr[CDataType] list_type http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_convert_builtin.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py index 7dc1c1b..a5f7aa5 100644 --- a/python/pyarrow/tests/test_convert_builtin.py +++ b/python/pyarrow/tests/test_convert_builtin.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pyarrow.compat import unittest +from pyarrow.compat import unittest, u import pyarrow import datetime @@ -71,16 +71,28 @@ class TestConvertList(unittest.TestCase): assert arr.type == pyarrow.double() assert arr.to_pylist() == data - def test_string(self): - data = ['foo', b'bar', None, 'arrow'] + def test_unicode(self): + data = [u('foo'), u('bar'), None, u('arrow')] arr = pyarrow.from_pylist(data) assert len(arr) == 4 assert arr.null_count == 1 assert arr.type == pyarrow.string() - assert arr.to_pylist() == ['foo', 'bar', None, 'arrow'] + assert arr.to_pylist() == [u('foo'), u('bar'), None, u('arrow')] + + def test_bytes(self): + u1 = b'ma\xc3\xb1ana' + data = [b'foo', + u1.decode('utf-8'), # unicode gets encoded, + None] + arr = pyarrow.from_pylist(data) + assert len(arr) == 3 + assert arr.null_count == 1 + assert arr.type == pyarrow.binary() + assert arr.to_pylist() == [b'foo', u1, None] def test_date(self): - data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), datetime.date(2040, 2, 26)] + data = [datetime.date(2000, 1, 1), None, datetime.date(1970, 1, 1), + datetime.date(2040, 2, 26)] arr = pyarrow.from_pylist(data) assert len(arr) == 4 assert arr.type == pyarrow.date() @@ -101,10 +113,13 @@ class TestConvertList(unittest.TestCase): assert len(arr) == 4 assert arr.type == pyarrow.timestamp() assert arr.null_count == 1 - assert arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, 23, 34, 123456) + assert 
arr[0].as_py() == datetime.datetime(2007, 7, 13, 1, + 23, 34, 123456) assert arr[1].as_py() is None - assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, 34, 56, 432539) - assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, 46, 57, 437699) + assert arr[2].as_py() == datetime.datetime(2006, 1, 13, 12, + 34, 56, 432539) + assert arr[3].as_py() == datetime.datetime(2010, 8, 13, 5, + 46, 57, 437699) def test_mixed_nesting_levels(self): pyarrow.from_pylist([1, 2, None]) http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_convert_pandas.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py index cf50f3d..da34f85 100644 --- a/python/pyarrow/tests/test_convert_pandas.py +++ b/python/pyarrow/tests/test_convert_pandas.py @@ -23,6 +23,7 @@ import numpy as np import pandas as pd import pandas.util.testing as tm +from pyarrow.compat import u import pyarrow as A @@ -157,13 +158,22 @@ class TestPandasConversion(unittest.TestCase): df = pd.DataFrame({'bools': arr}) self._check_pandas_roundtrip(df) - def test_strings(self): + def test_unicode(self): repeats = 1000 - values = [b'foo', None, u'bar', 'qux', np.nan] + values = [u('foo'), None, u('bar'), u('qux'), np.nan] df = pd.DataFrame({'strings': values * repeats}) - values = ['foo', None, u'bar', 'qux', None] - expected = pd.DataFrame({'strings': values * repeats}) + self._check_pandas_roundtrip(df) + + def test_bytes_to_binary(self): + values = [u('qux'), b'foo', None, 'bar', 'qux', np.nan] + df = pd.DataFrame({'strings': values}) + + table = A.from_pandas_dataframe(df) + assert table[0].type == A.binary() + + values2 = [b'qux', b'foo', None, b'bar', b'qux', np.nan] + expected = pd.DataFrame({'strings': values2}) self._check_pandas_roundtrip(df, expected) def test_timestamps_notimezone_no_nulls(self): 
http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/pyarrow/tests/test_scalars.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_scalars.py b/python/pyarrow/tests/test_scalars.py index 4fb850a..19cfacb 100644 --- a/python/pyarrow/tests/test_scalars.py +++ b/python/pyarrow/tests/test_scalars.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -from pyarrow.compat import unittest, u +from pyarrow.compat import unittest, u, unicode_type import pyarrow as A @@ -58,20 +58,32 @@ class TestScalars(unittest.TestCase): v = arr[2] assert v.as_py() == 3.0 - def test_string(self): - arr = A.from_pylist(['foo', None, u('bar')]) + def test_string_unicode(self): + arr = A.from_pylist([u('foo'), None, u('bar')]) v = arr[0] assert isinstance(v, A.StringValue) - assert repr(v) == "'foo'" assert v.as_py() == 'foo' assert arr[1] is A.NA v = arr[2].as_py() - assert v == 'bar' + assert v == u('bar') assert isinstance(v, str) + def test_bytes(self): + arr = A.from_pylist([b'foo', None, u('bar')]) + + v = arr[0] + assert isinstance(v, A.BinaryValue) + assert v.as_py() == b'foo' + + assert arr[1] is A.NA + + v = arr[2].as_py() + assert v == b'bar' + assert isinstance(v, bytes) + def test_list(self): arr = A.from_pylist([['foo', None], None, ['bar'], []]) http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/adapters/builtin.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/builtin.cc b/python/src/pyarrow/adapters/builtin.cc index e0cb7c2..2a13944 100644 --- a/python/src/pyarrow/adapters/builtin.cc +++ b/python/src/pyarrow/adapters/builtin.cc @@ -42,14 +42,6 @@ static inline bool IsPyInteger(PyObject* obj) { #endif } -static inline bool IsPyBaseString(PyObject* obj) { -#if PYARROW_IS_PY2 - return PyString_Check(obj) || PyUnicode_Check(obj); -#else - return PyUnicode_Check(obj); 
-#endif -} - class ScalarVisitor { public: ScalarVisitor() : @@ -60,7 +52,8 @@ class ScalarVisitor { date_count_(0), timestamp_count_(0), float_count_(0), - string_count_(0) {} + binary_count_(0), + unicode_count_(0) {} void Visit(PyObject* obj) { ++total_count_; @@ -76,8 +69,10 @@ class ScalarVisitor { ++date_count_; } else if (PyDateTime_CheckExact(obj)) { ++timestamp_count_; - } else if (IsPyBaseString(obj)) { - ++string_count_; + } else if (PyBytes_Check(obj)) { + ++binary_count_; + } else if (PyUnicode_Check(obj)) { + ++unicode_count_; } else { // TODO(wesm): accumulate error information somewhere } @@ -86,20 +81,22 @@ class ScalarVisitor { std::shared_ptr<DataType> GetType() { // TODO(wesm): handling mixed-type cases if (float_count_) { - return DOUBLE; + return arrow::float64(); } else if (int_count_) { // TODO(wesm): tighter type later - return INT64; + return arrow::int64(); } else if (date_count_) { - return DATE; + return arrow::date(); } else if (timestamp_count_) { - return TIMESTAMP_US; + return arrow::timestamp(arrow::TimeUnit::MICRO); } else if (bool_count_) { - return BOOL; - } else if (string_count_) { - return STRING; + return arrow::boolean(); + } else if (binary_count_) { + return arrow::binary(); + } else if (unicode_count_) { + return arrow::utf8(); } else { - return NA; + return arrow::null(); } } @@ -115,7 +112,8 @@ class ScalarVisitor { int64_t date_count_; int64_t timestamp_count_; int64_t float_count_; - int64_t string_count_; + int64_t binary_count_; + int64_t unicode_count_; // Place to accumulate errors // std::vector<Status> errors_; @@ -163,7 +161,7 @@ class SeqVisitor { std::shared_ptr<DataType> GetType() { if (scalars_.total_count() == 0) { if (max_nesting_level_ == 0) { - return NA; + return arrow::null(); } else { return nullptr; } @@ -227,7 +225,7 @@ static Status InferArrowType(PyObject* obj, int64_t* size, // For 0-length sequences, refuse to guess if (*size == 0) { - *out_type = NA; + *out_type = arrow::null(); } SeqVisitor 
seq_visitor; @@ -381,7 +379,7 @@ class DoubleConverter : public TypedConverter<arrow::DoubleBuilder> { } }; -class StringConverter : public TypedConverter<arrow::StringBuilder> { +class BytesConverter : public TypedConverter<arrow::BinaryBuilder> { public: Status AppendData(PyObject* seq) override { PyObject* item; @@ -415,6 +413,38 @@ class StringConverter : public TypedConverter<arrow::StringBuilder> { } }; +class UTF8Converter : public TypedConverter<arrow::StringBuilder> { + public: + Status AppendData(PyObject* seq) override { + PyObject* item; + PyObject* bytes_obj; + OwnedRef tmp; + const char* bytes; + int32_t length; + Py_ssize_t size = PySequence_Size(seq); + for (int64_t i = 0; i < size; ++i) { + item = PySequence_GetItem(seq, i); + OwnedRef holder(item); + + if (item == Py_None) { + RETURN_NOT_OK(typed_builder_->AppendNull()); + continue; + } else if (!PyUnicode_Check(item)) { + return Status::TypeError("Non-unicode value encountered"); + } + tmp.reset(PyUnicode_AsUTF8String(item)); + RETURN_IF_PYERROR(); + bytes_obj = tmp.obj(); + + // No error checking + length = PyBytes_GET_SIZE(bytes_obj); + bytes = PyBytes_AS_STRING(bytes_obj); + RETURN_NOT_OK(typed_builder_->Append(bytes, length)); + } + return Status::OK(); + } +}; + class ListConverter : public TypedConverter<arrow::ListBuilder> { public: Status Init(const std::shared_ptr<ArrayBuilder>& builder) override; @@ -449,8 +479,10 @@ std::shared_ptr<SeqConverter> GetConverter(const std::shared_ptr<DataType>& type return std::make_shared<TimestampConverter>(); case Type::DOUBLE: return std::make_shared<DoubleConverter>(); + case Type::BINARY: + return std::make_shared<BytesConverter>(); case Type::STRING: - return std::make_shared<StringConverter>(); + return std::make_shared<UTF8Converter>(); case Type::LIST: return std::make_shared<ListConverter>(); case Type::STRUCT: http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/adapters/pandas.cc 
---------------------------------------------------------------------- diff --git a/python/src/pyarrow/adapters/pandas.cc b/python/src/pyarrow/adapters/pandas.cc index f8dff6d..38f3b6f 100644 --- a/python/src/pyarrow/adapters/pandas.cc +++ b/python/src/pyarrow/adapters/pandas.cc @@ -193,6 +193,9 @@ class ArrowSerializer { Status ConvertObjectStrings(std::shared_ptr<Array>* out) { PyAcquireGIL lock; + // The output type at this point is inconclusive because there may be bytes + // and unicode mixed in the object array + PyObject** objects = reinterpret_cast<PyObject**>(PyArray_DATA(arr_)); arrow::TypePtr string_type(new arrow::StringType()); arrow::StringBuilder string_builder(pool_, string_type); @@ -200,6 +203,7 @@ class ArrowSerializer { Status s; PyObject* obj; + bool have_bytes = false; for (int64_t i = 0; i < length_; ++i) { obj = objects[i]; if (PyUnicode_Check(obj)) { @@ -215,13 +219,21 @@ class ArrowSerializer { return s; } } else if (PyBytes_Check(obj)) { + have_bytes = true; const int32_t length = PyBytes_GET_SIZE(obj); RETURN_NOT_OK(string_builder.Append(PyBytes_AS_STRING(obj), length)); } else { string_builder.AppendNull(); } } - return string_builder.Finish(out); + RETURN_NOT_OK(string_builder.Finish(out)); + + if (have_bytes) { + const auto& arr = static_cast<const arrow::StringArray&>(*out->get()); + *out = std::make_shared<arrow::BinaryArray>(arr.length(), arr.offsets(), + arr.data(), arr.null_count(), arr.null_bitmap()); + } + return Status::OK(); } Status ConvertBooleans(std::shared_ptr<Array>* out) { @@ -865,7 +877,7 @@ class ArrowDeserializer { return Status::OK(); } - // UTF8 + // UTF8 strings template <int T2> inline typename std::enable_if< T2 == arrow::Type::STRING, Status>::type @@ -912,6 +924,54 @@ class ArrowDeserializer { return Status::OK(); } + template <int T2> + inline typename std::enable_if< + T2 == arrow::Type::BINARY, Status>::type + ConvertValues(const std::shared_ptr<arrow::ChunkedArray>& data) { + size_t chunk_offset = 0; + 
PyAcquireGIL lock; + + RETURN_NOT_OK(AllocateOutput(NPY_OBJECT)); + + for (int c = 0; c < data->num_chunks(); c++) { + const std::shared_ptr<Array> arr = data->chunk(c); + auto binary_arr = static_cast<arrow::BinaryArray*>(arr.get()); + auto out_values = reinterpret_cast<PyObject**>(PyArray_DATA(out_)) + chunk_offset; + + const uint8_t* data_ptr; + int32_t length; + if (data->null_count() > 0) { + for (int64_t i = 0; i < arr->length(); ++i) { + if (binary_arr->IsNull(i)) { + Py_INCREF(Py_None); + out_values[i] = Py_None; + } else { + data_ptr = binary_arr->GetValue(i, &length); + + out_values[i] = PyBytes_FromStringAndSize( + reinterpret_cast<const char*>(data_ptr), length); + if (out_values[i] == nullptr) { + return Status::UnknownError("String initialization failed"); + } + } + } + } else { + for (int64_t i = 0; i < arr->length(); ++i) { + data_ptr = binary_arr->GetValue(i, &length); + out_values[i] = PyBytes_FromStringAndSize( + reinterpret_cast<const char*>(data_ptr), length); + if (out_values[i] == nullptr) { + return Status::UnknownError("String initialization failed"); + } + } + } + + chunk_offset += binary_arr->length(); + } + + return Status::OK(); + } + private: std::shared_ptr<Column> col_; PyObject* py_ref_; @@ -948,6 +1008,7 @@ Status ConvertColumnToPandas(const std::shared_ptr<Column>& col, PyObject* py_re FROM_ARROW_CASE(UINT64); FROM_ARROW_CASE(FLOAT); FROM_ARROW_CASE(DOUBLE); + FROM_ARROW_CASE(BINARY); FROM_ARROW_CASE(STRING); FROM_ARROW_CASE(DATE); FROM_ARROW_CASE(TIMESTAMP); http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/helpers.cc ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/helpers.cc b/python/src/pyarrow/helpers.cc index af92744..b42199c 100644 --- a/python/src/pyarrow/helpers.cc +++ b/python/src/pyarrow/helpers.cc @@ -23,47 +23,33 @@ using namespace arrow; namespace pyarrow { -const std::shared_ptr<NullType> NA = std::make_shared<NullType>(); -const 
std::shared_ptr<BooleanType> BOOL = std::make_shared<BooleanType>(); -const std::shared_ptr<UInt8Type> UINT8 = std::make_shared<UInt8Type>(); -const std::shared_ptr<UInt16Type> UINT16 = std::make_shared<UInt16Type>(); -const std::shared_ptr<UInt32Type> UINT32 = std::make_shared<UInt32Type>(); -const std::shared_ptr<UInt64Type> UINT64 = std::make_shared<UInt64Type>(); -const std::shared_ptr<Int8Type> INT8 = std::make_shared<Int8Type>(); -const std::shared_ptr<Int16Type> INT16 = std::make_shared<Int16Type>(); -const std::shared_ptr<Int32Type> INT32 = std::make_shared<Int32Type>(); -const std::shared_ptr<Int64Type> INT64 = std::make_shared<Int64Type>(); -const std::shared_ptr<DateType> DATE = std::make_shared<DateType>(); -const std::shared_ptr<TimestampType> TIMESTAMP_US = std::make_shared<TimestampType>(TimeUnit::MICRO); -const std::shared_ptr<FloatType> FLOAT = std::make_shared<FloatType>(); -const std::shared_ptr<DoubleType> DOUBLE = std::make_shared<DoubleType>(); -const std::shared_ptr<StringType> STRING = std::make_shared<StringType>(); -#define GET_PRIMITIVE_TYPE(NAME, Class) \ +#define GET_PRIMITIVE_TYPE(NAME, FACTORY) \ case Type::NAME: \ - return NAME; \ + return FACTORY(); \ break; std::shared_ptr<DataType> GetPrimitiveType(Type::type type) { switch (type) { case Type::NA: - return NA; - GET_PRIMITIVE_TYPE(UINT8, UInt8Type); - GET_PRIMITIVE_TYPE(INT8, Int8Type); - GET_PRIMITIVE_TYPE(UINT16, UInt16Type); - GET_PRIMITIVE_TYPE(INT16, Int16Type); - GET_PRIMITIVE_TYPE(UINT32, UInt32Type); - GET_PRIMITIVE_TYPE(INT32, Int32Type); - GET_PRIMITIVE_TYPE(UINT64, UInt64Type); - GET_PRIMITIVE_TYPE(INT64, Int64Type); - GET_PRIMITIVE_TYPE(DATE, DateType); + return null(); + GET_PRIMITIVE_TYPE(UINT8, uint8); + GET_PRIMITIVE_TYPE(INT8, int8); + GET_PRIMITIVE_TYPE(UINT16, uint16); + GET_PRIMITIVE_TYPE(INT16, int16); + GET_PRIMITIVE_TYPE(UINT32, uint32); + GET_PRIMITIVE_TYPE(INT32, int32); + GET_PRIMITIVE_TYPE(UINT64, uint64); + GET_PRIMITIVE_TYPE(INT64, int64); + 
GET_PRIMITIVE_TYPE(DATE, date); case Type::TIMESTAMP: - return TIMESTAMP_US; + return arrow::timestamp(arrow::TimeUnit::MICRO); break; - GET_PRIMITIVE_TYPE(BOOL, BooleanType); - GET_PRIMITIVE_TYPE(FLOAT, FloatType); - GET_PRIMITIVE_TYPE(DOUBLE, DoubleType); - GET_PRIMITIVE_TYPE(STRING, StringType); + GET_PRIMITIVE_TYPE(BOOL, boolean); + GET_PRIMITIVE_TYPE(FLOAT, float32); + GET_PRIMITIVE_TYPE(DOUBLE, float64); + GET_PRIMITIVE_TYPE(BINARY, binary); + GET_PRIMITIVE_TYPE(STRING, utf8); default: return nullptr; } http://git-wip-us.apache.org/repos/asf/arrow/blob/268ffbef/python/src/pyarrow/helpers.h ---------------------------------------------------------------------- diff --git a/python/src/pyarrow/helpers.h b/python/src/pyarrow/helpers.h index e714bba..8334d97 100644 --- a/python/src/pyarrow/helpers.h +++ b/python/src/pyarrow/helpers.h @@ -28,22 +28,6 @@ namespace pyarrow { using arrow::DataType; using arrow::Type; -extern const std::shared_ptr<arrow::NullType> NA; -extern const std::shared_ptr<arrow::BooleanType> BOOL; -extern const std::shared_ptr<arrow::UInt8Type> UINT8; -extern const std::shared_ptr<arrow::UInt16Type> UINT16; -extern const std::shared_ptr<arrow::UInt32Type> UINT32; -extern const std::shared_ptr<arrow::UInt64Type> UINT64; -extern const std::shared_ptr<arrow::Int8Type> INT8; -extern const std::shared_ptr<arrow::Int16Type> INT16; -extern const std::shared_ptr<arrow::Int32Type> INT32; -extern const std::shared_ptr<arrow::Int64Type> INT64; -extern const std::shared_ptr<arrow::DateType> DATE; -extern const std::shared_ptr<arrow::TimestampType> TIMESTAMP_US; -extern const std::shared_ptr<arrow::FloatType> FLOAT; -extern const std::shared_ptr<arrow::DoubleType> DOUBLE; -extern const std::shared_ptr<arrow::StringType> STRING; - PYARROW_EXPORT std::shared_ptr<DataType> GetPrimitiveType(Type::type type);