Repository: arrow Updated Branches: refs/heads/master bb287e203 -> 7f20f6e73
ARROW-818: [Python] Expand Sphinx API docs, pyarrow.* namespace. Add factory functions for time32, time64 Author: Wes McKinney <wes.mckin...@twosigma.com> Closes #557 from wesm/ARROW-818 and squashes the following commits: 96ce436 [Wes McKinney] Expand Sphinx API docs, pyarrow.* namespace. Add factory functions for time32, time64 Project: http://git-wip-us.apache.org/repos/asf/arrow/repo Commit: http://git-wip-us.apache.org/repos/asf/arrow/commit/7f20f6e7 Tree: http://git-wip-us.apache.org/repos/asf/arrow/tree/7f20f6e7 Diff: http://git-wip-us.apache.org/repos/asf/arrow/diff/7f20f6e7 Branch: refs/heads/master Commit: 7f20f6e738a2e163b0b753416ee4c4ed00998f4b Parents: bb287e2 Author: Wes McKinney <wes.mckin...@twosigma.com> Authored: Tue Apr 18 16:37:03 2017 +0200 Committer: Uwe L. Korn <uw...@xhochy.com> Committed: Tue Apr 18 16:37:03 2017 +0200 ---------------------------------------------------------------------- python/doc/source/api.rst | 69 +++++++++++++++++++++++----- python/pyarrow/__init__.py | 33 ++++++++++---- python/pyarrow/_array.pxd | 10 +++++ python/pyarrow/_array.pyx | 74 ++++++++++++++++++++++++++++++- python/pyarrow/_io.pyx | 6 +-- python/pyarrow/includes/libarrow.pxd | 3 ++ python/pyarrow/tests/test_io.py | 4 +- python/pyarrow/tests/test_schema.py | 21 +++++++++ 8 files changed, 195 insertions(+), 25 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/doc/source/api.rst ---------------------------------------------------------------------- diff --git a/python/doc/source/api.rst b/python/doc/source/api.rst index 92e248b..08a0694 100644 --- a/python/doc/source/api.rst +++ b/python/doc/source/api.rst @@ -24,8 +24,8 @@ API Reference .. _api.functions: -Type Metadata and Schemas -------------------------- +Type and Schema Factory Functions +--------------------------------- .. autosummary:: :toctree: generated/ @@ -43,6 +43,8 @@ Type Metadata and Schemas float16 float32 float64 + time32 + time64 timestamp date32 date64 @@ -53,10 +55,8 @@ Type Metadata and Schemas struct dictionary field - DataType - Field - Schema schema + from_numpy_dtype Scalar Value Types ------------------ @@ -68,6 +68,7 @@ Scalar Value Types NAType Scalar ArrayValue + BooleanValue Int8Value Int16Value Int32Value @@ -82,6 +83,11 @@ Scalar Value Types BinaryValue StringValue FixedSizeBinaryValue + Date32Value + Date64Value + TimestampValue + DecimalValue + Array Types and Constructors ---------------------------- @@ -91,21 +97,30 @@ Array Types and Constructors array Array - NullArray - NumericArray - IntegerArray - FloatingPointArray BooleanArray + DictionaryArray + FloatingPointArray + IntegerArray Int8Array Int16Array Int32Array Int64Array + NullArray + NumericArray UInt8Array UInt16Array UInt32Array UInt64Array - DictionaryArray + BinaryArray + FixedSizeBinaryArray StringArray + Time32Array + Time64Array + Date32Array + Date64Array + TimestampArray + DecimalArray + ListArray Tables and Record Batches ------------------------- @@ -113,9 +128,11 @@ Tables and Record Batches .. autosummary:: :toctree: generated/ + ChunkedArray Column RecordBatch Table + get_record_batch_size Tensor type and Functions ------------------------- @@ -141,7 +158,7 @@ Input / Output and Shared Memory MemoryMappedFile memory_map create_memory_map - PythonFileInterface + PythonFile Interprocess Communication and Messaging ---------------------------------------- @@ -165,3 +182,33 @@ Memory Pools jemalloc_memory_pool total_allocated_bytes set_memory_pool + +Type Classes +------------ + +.. autosummary:: + :toctree: generated/ + + DataType + DecimalType + DictionaryType + FixedSizeBinaryType + Time32Type + Time64Type + TimestampType + Field + Schema + +.. currentmodule:: pyarrow.parquet + +Apache Parquet +-------------- + +.. autosummary:: + :toctree: generated/ + + ParquetDataset + ParquetFile + read_table + write_metadata + write_table http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/__init__.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/__init__.py b/python/pyarrow/__init__.py index 87f2352..4d8da9f 100644 --- a/python/pyarrow/__init__.py +++ b/python/pyarrow/__init__.py @@ -31,12 +31,20 @@ from pyarrow._config import cpu_count, set_cpu_count from pyarrow._array import (null, bool_, int8, int16, int32, int64, uint8, uint16, uint32, uint64, - timestamp, date32, date64, + time32, time64, timestamp, date32, date64, float16, float32, float64, binary, string, decimal, list_, struct, dictionary, field, - DataType, FixedSizeBinaryType, - Field, Schema, schema, + DataType, + DecimalType, + DictionaryType, + FixedSizeBinaryType, + TimestampType, + Time32Type, + Time64Type, + Field, + Schema, + schema, Array, Tensor, array, from_numpy_dtype, @@ -47,25 +55,34 @@ from pyarrow._array import (null, bool_, Int16Array, UInt16Array, Int32Array, UInt32Array, Int64Array, UInt64Array, - ListArray, StringArray, + ListArray, + BinaryArray, StringArray, + FixedSizeBinaryArray, DictionaryArray, + Date32Array, Date64Array, + TimestampArray, Time32Array, Time64Array, + DecimalArray, ArrayValue, Scalar, NA, NAType, BooleanValue, Int8Value, Int16Value, Int32Value, Int64Value, UInt8Value, UInt16Value, UInt32Value, UInt64Value, FloatValue, DoubleValue, ListValue, - BinaryValue, StringValue, FixedSizeBinaryValue) + BinaryValue, StringValue, FixedSizeBinaryValue, + DecimalValue, + Date32Value, Date64Value, TimestampValue) -from pyarrow._io import (HdfsFile, NativeFile, PythonFileInterface, +from pyarrow._io import (HdfsFile, NativeFile, PythonFile, Buffer, BufferReader, InMemoryOutputStream, OSFile, MemoryMappedFile, memory_map, frombuffer, read_tensor, write_tensor, memory_map, create_memory_map, - get_record_batch_size, get_tensor_size) + get_record_batch_size, get_tensor_size, + have_libhdfs, have_libhdfs3) from pyarrow._memory import (MemoryPool, total_allocated_bytes, set_memory_pool, default_memory_pool) -from pyarrow._table import Column, RecordBatch, Table, concat_tables +from pyarrow._table import (ChunkedArray, Column, RecordBatch, Table, + concat_tables) from pyarrow._error import (ArrowException, ArrowKeyError, ArrowInvalid, http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/_array.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pxd b/python/pyarrow/_array.pxd index afb0c27..464de31 100644 --- a/python/pyarrow/_array.pxd +++ b/python/pyarrow/_array.pxd @@ -42,6 +42,16 @@ cdef class TimestampType(DataType): const CTimestampType* ts_type +cdef class Time32Type(DataType): + cdef: + const CTime32Type* time_type + + +cdef class Time64Type(DataType): + cdef: + const CTime64Type* time_type + + cdef class FixedSizeBinaryType(DataType): cdef: const CFixedSizeBinaryType* fixed_size_binary_type http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/_array.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_array.pyx b/python/pyarrow/_array.pyx index e41380d..1c571ba 100644 --- a/python/pyarrow/_array.pyx +++ b/python/pyarrow/_array.pyx @@ -127,6 +127,30 @@ cdef class TimestampType(DataType): return None +cdef class Time32Type(DataType): + + cdef void init(self, const shared_ptr[CDataType]& type): + DataType.init(self, type) + self.time_type = <const CTime32Type*> type.get() + + property unit: + + def __get__(self): + return timeunit_to_string(self.time_type.unit()) + + +cdef class Time64Type(DataType): + + cdef void init(self, const shared_ptr[CDataType]& type): + DataType.init(self, type) + self.time_type = <const CTime64Type*> type.get() + + property unit: + + def __get__(self): + return timeunit_to_string(self.time_type.unit()) + + cdef class FixedSizeBinaryType(DataType): cdef void init(self, const shared_ptr[CDataType]& type): @@ -342,6 +366,7 @@ def int64(): cdef dict _timestamp_type_cache = {} +cdef dict _time_type_cache = {} cdef timeunit_to_string(TimeUnit unit): @@ -369,7 +394,7 @@ def timestamp(unit_str, tz=None): elif unit_str == 'ns': unit = TimeUnit_NANO else: - raise TypeError('Invalid TimeUnit string') + raise ValueError('Invalid TimeUnit string') cdef TimestampType out = TimestampType() @@ -388,6 +413,50 @@ def timestamp(unit_str, tz=None): return out +def time32(unit_str): + cdef: + TimeUnit unit + c_string c_timezone + + if unit_str == "s": + unit = TimeUnit_SECOND + elif unit_str == 'ms': + unit = TimeUnit_MILLI + else: + raise ValueError('Invalid TimeUnit for time32: {}'.format(unit_str)) + + cdef Time32Type out + if unit in _time_type_cache: + return _time_type_cache[unit] + else: + out = Time32Type() + out.init(ctime32(unit)) + _time_type_cache[unit] = out + return out + + +def time64(unit_str): + cdef: + TimeUnit unit + c_string c_timezone + + if unit_str == "us": + unit = TimeUnit_MICRO + elif unit_str == 'ns': + unit = TimeUnit_NANO + else: + raise ValueError('Invalid TimeUnit for time64: {}'.format(unit_str)) + + cdef Time64Type out + if unit in _time_type_cache: + return _time_type_cache[unit] + else: + out = Time64Type() + out.init(ctime64(unit)) + _time_type_cache[unit] = out + return out + + def date32(): return primitive_type(_Type_DATE32) @@ -516,6 +585,9 @@ cdef Schema box_schema(const shared_ptr[CSchema]& type): def from_numpy_dtype(object dtype): + """ + Convert NumPy dtype to pyarrow.DataType + """ cdef shared_ptr[CDataType] c_type with nogil: check_status(pyarrow.NumPyDtypeToArrow(dtype, &c_type)) http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/_io.pyx ---------------------------------------------------------------------- diff --git a/python/pyarrow/_io.pyx b/python/pyarrow/_io.pyx index 09e8233..40c76f8 100644 --- a/python/pyarrow/_io.pyx +++ b/python/pyarrow/_io.pyx @@ -307,7 +307,7 @@ cdef class NativeFile: # Python file-like objects -cdef class PythonFileInterface(NativeFile): +cdef class PythonFile(NativeFile): cdef: object handle @@ -600,7 +600,7 @@ cdef get_reader(object source, shared_ptr[RandomAccessFile]* reader): source = BufferReader(source) elif not isinstance(source, NativeFile) and hasattr(source, 'read'): # Optimistically hope this is file-like - source = PythonFileInterface(source, mode='r') + source = PythonFile(source, mode='r') if isinstance(source, NativeFile): nf = source @@ -622,7 +622,7 @@ cdef get_writer(object source, shared_ptr[OutputStream]* writer): source = OSFile(source, mode='w') elif not isinstance(source, NativeFile) and hasattr(source, 'write'): # Optimistically hope this is file-like - source = PythonFileInterface(source, mode='w') + source = PythonFile(source, mode='w') if isinstance(source, NativeFile): nf = source http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/includes/libarrow.pxd ---------------------------------------------------------------------- diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index ea835f6..473a0b9 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -106,6 +106,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: cdef cppclass CTime64Type" arrow::Time64Type"(CFixedWidthType): TimeUnit unit() + shared_ptr[CDataType] ctime32" arrow::time32"(TimeUnit unit) + shared_ptr[CDataType] ctime64" arrow::time64"(TimeUnit unit) + cdef cppclass CDictionaryType" arrow::DictionaryType"(CFixedWidthType): CDictionaryType(const shared_ptr[CDataType]& index_type, const shared_ptr[CArray]& dictionary) http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/tests/test_io.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_io.py b/python/pyarrow/tests/test_io.py index c5d3708..a14898f 100644 --- a/python/pyarrow/tests/test_io.py +++ b/python/pyarrow/tests/test_io.py @@ -32,7 +32,7 @@ import pyarrow as pa def test_python_file_write(): buf = BytesIO() - f = pa.PythonFileInterface(buf) + f = pa.PythonFile(buf) assert f.tell() == 0 @@ -56,7 +56,7 @@ def test_python_file_read(): data = b'some sample data' buf = BytesIO(data) - f = pa.PythonFileInterface(buf, mode='r') + f = pa.PythonFile(buf, mode='r') assert f.size() == len(data) http://git-wip-us.apache.org/repos/asf/arrow/blob/7f20f6e7/python/pyarrow/tests/test_schema.py ---------------------------------------------------------------------- diff --git a/python/pyarrow/tests/test_schema.py b/python/pyarrow/tests/test_schema.py index d1107fb..da704f3 100644 --- a/python/pyarrow/tests/test_schema.py +++ b/python/pyarrow/tests/test_schema.py @@ -77,6 +77,27 @@ def test_type_timestamp_with_tz(): assert t.tz == tz +def test_time_types(): + t1 = pa.time32('s') + t2 = pa.time32('ms') + t3 = pa.time64('us') + t4 = pa.time64('ns') + + assert t1.unit == 's' + assert t2.unit == 'ms' + assert t3.unit == 'us' + assert t4.unit == 'ns' + + assert str(t1) == 'time32[s]' + assert str(t4) == 'time64[ns]' + + with pytest.raises(ValueError): + pa.time32('us') + + with pytest.raises(ValueError): + pa.time64('s') + + def test_type_from_numpy_dtype_timestamps(): cases = [ (np.dtype('datetime64[s]'), pa.timestamp('s')),