This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 49ccf6a ARROW-2760: [Python] Remove legacy property definition syntax
from parquet module and test them
49ccf6a is described below
commit 49ccf6aee748660b03d99fc22d3f775cf0fd8e97
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Fri Jul 27 11:26:00 2018 -0400
ARROW-2760: [Python] Remove legacy property definition syntax from parquet
module and test them
Author: Krisztián Szűcs <[email protected]>
Closes #2187 from kszucs/cython_properties and squashes the following
commits:
d6a7f779 <Krisztián Szűcs> remove accidentally committed test helper script
77c59d66 <Krisztián Szűcs> return NotImplemented from equality check
b1e7bede <Krisztián Szűcs> raise NotImplementedError for index_page_offset
71f5edee <Krisztián Szűcs> flake8
74d53bb9 <Krisztián Szűcs> missing distinct_count equals to zero
40ba9651 <Krisztián Szűcs> expose missing parquet classes to pq namespace
6942eba1 <Krisztián Szűcs> comments in compare schemas
1e1c7cd6 <Krisztián Szűcs> test and fix column chunk metadata properties
ffa10413 <Krisztián Szűcs> expected distinct_count value is None
d55b96a8 <Krisztián Szűcs> fix types
dd362295 <Krisztián Szűcs> rename type property to physical_type
a43153e9 <Krisztián Szűcs> warn instead of print
266a022b <Krisztián Szűcs> implement equality operator on ParquetSchema and
ColumnSchema
c58441a4 <Krisztián Szűcs> due to receiving extension class as argument
it's possible to use __cinit__ constructors directly instead of custom init
methods
06e4f8ed <Krisztián Szűcs> remove enum cpdef
c0e2a08c <Krisztián Szűcs> replace old property syntax; move indent to
formatting; test missing column stat properties
---
python/pyarrow/_parquet.pyx | 541 ++++++++++++++++-------------------
python/pyarrow/formatting.py | 15 +-
python/pyarrow/parquet.py | 8 +-
python/pyarrow/tests/test_parquet.py | 116 ++++++--
4 files changed, 352 insertions(+), 328 deletions(-)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 983ff8d..1aa2124 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -31,16 +31,11 @@ from pyarrow.lib cimport (Array, Schema,
NativeFile, get_reader, get_writer)
from pyarrow.compat import tobytes, frombytes
+from pyarrow.formatting import indent
from pyarrow.lib import ArrowException, NativeFile, _stringify_path
import six
-
-try:
- from textwrap import indent
-except ImportError:
- def indent(text, prefix):
- lines = [prefix + line for line in text.splitlines(True)]
- return ''.join(lines)
+import warnings
cdef class RowGroupStatistics:
@@ -95,49 +90,42 @@ cdef class RowGroupStatistics:
else:
raise ValueError('Unknown physical ParquetType')
- property has_min_max:
-
- def __get__(self):
- return self.statistics.get().HasMinMax()
-
- property min:
-
- def __get__(self):
- raw_physical_type = self.statistics.get().physical_type()
- encode_min = self.statistics.get().EncodeMin()
-
- min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
- return self._cast_statistic(min_value)
-
- property max:
-
- def __get__(self):
- raw_physical_type = self.statistics.get().physical_type()
- encode_max = self.statistics.get().EncodeMax()
-
- max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
- return self._cast_statistic(max_value)
+ @property
+ def has_min_max(self):
+ return self.statistics.get().HasMinMax()
- property null_count:
+ @property
+ def min(self):
+ raw_physical_type = self.statistics.get().physical_type()
+ encode_min = self.statistics.get().EncodeMin()
- def __get__(self):
- return self.statistics.get().null_count()
+ min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
+ return self._cast_statistic(min_value)
- property distinct_count:
+ @property
+ def max(self):
+ raw_physical_type = self.statistics.get().physical_type()
+ encode_max = self.statistics.get().EncodeMax()
- def __get__(self):
- return self.statistics.get().distinct_count()
+ max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
+ return self._cast_statistic(max_value)
- property num_values:
+ @property
+ def null_count(self):
+ return self.statistics.get().null_count()
- def __get__(self):
- return self.statistics.get().num_values()
+ @property
+ def distinct_count(self):
+ return self.statistics.get().distinct_count()
- property physical_type:
+ @property
+ def num_values(self):
+ return self.statistics.get().num_values()
- def __get__(self):
- physical_type = self.statistics.get().physical_type()
- return physical_type_name_from_enum(physical_type)
+ @property
+ def physical_type(self):
+ raw_physical_type = self.statistics.get().physical_type()
+ return physical_type_name_from_enum(raw_physical_type)
cdef class ColumnChunkMetaData:
@@ -157,7 +145,7 @@ cdef class ColumnChunkMetaData:
return """{0}
file_offset: {1}
file_path: {2}
- type: {3}
+ physical_type: {3}
num_values: {4}
path_in_schema: {5}
is_stats_set: {6}
@@ -168,12 +156,11 @@ cdef class ColumnChunkMetaData:
has_dictionary_page: {10}
dictionary_page_offset: {11}
data_page_offset: {12}
- index_page_offset: {13}
- total_compressed_size: {14}
- total_uncompressed_size: {15}""".format(object.__repr__(self),
+ total_compressed_size: {13}
+ total_uncompressed_size: {14}""".format(object.__repr__(self),
self.file_offset,
self.file_path,
- self.type,
+ self.physical_type,
self.num_values,
self.path_in_schema,
self.is_stats_set,
@@ -183,90 +170,80 @@ cdef class ColumnChunkMetaData:
self.has_dictionary_page,
self.dictionary_page_offset,
self.data_page_offset,
- self.index_page_offset,
self.total_compressed_size,
self.total_uncompressed_size)
- property file_offset:
-
- def __get__(self):
- return self.metadata.file_offset()
-
- property file_path:
-
- def __get__(self):
- return frombytes(self.metadata.file_path())
-
- property type:
-
- def __get__(self):
- return physical_type_name_from_enum(self.metadata.type())
-
- property num_values:
-
- def __get__(self):
- return self.metadata.num_values()
-
- property path_in_schema:
-
- def __get__(self):
- path = self.metadata.path_in_schema().get().ToDotString()
- return frombytes(path)
-
- property is_stats_set:
-
- def __get__(self):
- return self.metadata.is_stats_set()
+ @property
+ def file_offset(self):
+ return self.metadata.file_offset()
- property statistics:
+ @property
+ def file_path(self):
+ return frombytes(self.metadata.file_path())
- def __get__(self):
- if not self.metadata.is_stats_set():
- return None
- statistics = RowGroupStatistics()
- statistics.init(self.metadata.statistics())
- return statistics
+ @property
+ def physical_type(self):
+ return physical_type_name_from_enum(self.metadata.type())
- property compression:
+ @property
+ def num_values(self):
+ return self.metadata.num_values()
- def __get__(self):
- return self.metadata.compression()
+ @property
+ def path_in_schema(self):
+ path = self.metadata.path_in_schema().get().ToDotString()
+ return frombytes(path)
- property encodings:
+ @property
+ def is_stats_set(self):
+ return self.metadata.is_stats_set()
- def __get__(self):
- return map(encoding_name_from_enum,
- self.metadata.encodings())
+ @property
+ def statistics(self):
+ if not self.metadata.is_stats_set():
+ return None
+ statistics = RowGroupStatistics()
+ statistics.init(self.metadata.statistics())
+ return statistics
- property has_dictionary_page:
+ @property
+ def compression(self):
+ return compression_name_from_enum(self.metadata.compression())
- def __get__(self):
- return self.metadata.has_dictionary_page()
+ @property
+ def encodings(self):
+ return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
- property dictionary_page_offset:
+ @property
+ def has_dictionary_page(self):
+ return bool(self.metadata.has_dictionary_page())
- def __get__(self):
+ @property
+ def dictionary_page_offset(self):
+ if self.has_dictionary_page:
return self.metadata.dictionary_page_offset()
+ else:
+ return None
- property data_page_offset:
-
- def __get__(self):
- return self.metadata.data_page_offset()
-
- property index_page_offset:
-
- def __get__(self):
- return self.metadata.index_page_offset()
+ @property
+ def data_page_offset(self):
+ return self.metadata.data_page_offset()
- property total_compressed_size:
+ @property
+ def has_index_page(self):
+ raise NotImplementedError('not supported in parquet-cpp')
- def __get__(self):
- return self.metadata.total_compressed_size()
+ @property
+ def index_page_offset(self):
+ raise NotImplementedError("parquet-cpp doesn't return valid values")
- property total_uncompressed_size:
+ @property
+ def total_compressed_size(self):
+ return self.metadata.total_compressed_size()
- def __get__(self):
- return self.metadata.total_uncompressed_size()
+ @property
+ def total_uncompressed_size(self):
+ return self.metadata.total_uncompressed_size()
cdef class RowGroupMetaData:
@@ -275,10 +252,7 @@ cdef class RowGroupMetaData:
CRowGroupMetaData* metadata
FileMetaData parent
- def __cinit__(self):
- pass
-
- cdef void init_from_file(self, FileMetaData parent, int i):
+ def __cinit__(self, FileMetaData parent, int i):
if i < 0 or i >= parent.num_row_groups:
raise IndexError('{0} out of bounds'.format(i))
self.up_metadata = parent._metadata.RowGroup(i)
@@ -299,20 +273,17 @@ cdef class RowGroupMetaData:
self.num_rows,
self.total_byte_size)
- property num_columns:
-
- def __get__(self):
- return self.metadata.num_columns()
-
- property num_rows:
-
- def __get__(self):
- return self.metadata.num_rows()
+ @property
+ def num_columns(self):
+ return self.metadata.num_columns()
- property total_byte_size:
+ @property
+ def num_rows(self):
+ return self.metadata.num_rows()
- def __get__(self):
- return self.metadata.total_byte_size()
+ @property
+ def total_byte_size(self):
+ return self.metadata.total_byte_size()
cdef class FileMetaData:
@@ -343,72 +314,56 @@ cdef class FileMetaData:
@property
def schema(self):
- if self._schema is not None:
- return self._schema
-
- cdef ParquetSchema schema = ParquetSchema()
- schema.init_from_filemeta(self)
- self._schema = schema
- return schema
-
- property serialized_size:
-
- def __get__(self):
- return self._metadata.size()
-
- property num_columns:
-
- def __get__(self):
- return self._metadata.num_columns()
+ if self._schema is None:
+ self._schema = ParquetSchema(self)
+ return self._schema
- property num_rows:
-
- def __get__(self):
- return self._metadata.num_rows()
+ @property
+ def serialized_size(self):
+ return self._metadata.size()
- property num_row_groups:
+ @property
+ def num_columns(self):
+ return self._metadata.num_columns()
- def __get__(self):
- return self._metadata.num_row_groups()
+ @property
+ def num_rows(self):
+ return self._metadata.num_rows()
- property format_version:
+ @property
+ def num_row_groups(self):
+ return self._metadata.num_row_groups()
- def __get__(self):
- cdef ParquetVersion version = self._metadata.version()
- if version == ParquetVersion_V1:
- return '1.0'
- if version == ParquetVersion_V2:
- return '2.0'
- else:
- print('Unrecognized file version, assuming 1.0: {0}'
- .format(version))
- return '1.0'
+ @property
+ def format_version(self):
+ cdef ParquetVersion version = self._metadata.version()
+ if version == ParquetVersion_V1:
+ return '1.0'
+ if version == ParquetVersion_V2:
+ return '2.0'
+ else:
+ warnings.warn('Unrecognized file version, assuming 1.0: {}'
+ .format(version))
+ return '1.0'
- property created_by:
+ @property
+ def created_by(self):
+ return frombytes(self._metadata.created_by())
- def __get__(self):
- return frombytes(self._metadata.created_by())
+ @property
+ def metadata(self):
+ cdef:
+ unordered_map[c_string, c_string] metadata
+ const CKeyValueMetadata* underlying_metadata
+ underlying_metadata = self._metadata.key_value_metadata().get()
+ if underlying_metadata != NULL:
+ underlying_metadata.ToUnorderedMap(&metadata)
+ return metadata
+ else:
+ return None
def row_group(self, int i):
- """
-
- """
- cdef RowGroupMetaData result = RowGroupMetaData()
- result.init_from_file(self, i)
- return result
-
- property metadata:
-
- def __get__(self):
- cdef:
- unordered_map[c_string, c_string] metadata
- const CKeyValueMetadata* underlying_metadata
- underlying_metadata = self._metadata.key_value_metadata().get()
- if underlying_metadata != NULL:
- underlying_metadata.ToUnorderedMap(&metadata)
- return metadata
- else:
- return None
+ return RowGroupMetaData(self, i)
cdef class ParquetSchema:
@@ -416,8 +371,9 @@ cdef class ParquetSchema:
FileMetaData parent # the FileMetaData owning the SchemaDescriptor
const SchemaDescriptor* schema
- def __cinit__(self):
- self.schema = NULL
+ def __cinit__(self, FileMetaData container):
+ self.parent = container
+ self.schema = container._metadata.schema()
def __repr__(self):
cdef const ColumnDescriptor* descr
@@ -434,20 +390,15 @@ cdef class ParquetSchema:
{1}
""".format(object.__repr__(self), '\n'.join(elements))
- cdef init_from_filemeta(self, FileMetaData container):
- self.parent = container
- self.schema = container._metadata.schema()
-
def __len__(self):
return self.schema.num_columns()
def __getitem__(self, i):
return self.column(i)
- property names:
-
- def __get__(self):
- return [self[i].name for i in range(len(self))]
+ @property
+ def names(self):
+ return [self[i].name for i in range(len(self))]
def to_arrow_schema(self):
"""
@@ -457,8 +408,7 @@ cdef class ParquetSchema:
-------
schema : pyarrow.Schema
"""
- cdef:
- shared_ptr[CSchema] sp_arrow_schema
+ cdef shared_ptr[CSchema] sp_arrow_schema
with nogil:
check_status(FromParquetSchema(
@@ -467,6 +417,12 @@ cdef class ParquetSchema:
return pyarrow_wrap_schema(sp_arrow_schema)
+ def __eq__(self, other):
+ try:
+ return self.equals(other)
+ except TypeError:
+ return NotImplemented
+
def equals(self, ParquetSchema other):
"""
Returns True if the Parquet schemas are equal
@@ -477,9 +433,7 @@ cdef class ParquetSchema:
if i < 0 or i >= len(self):
raise IndexError('{0} out of bounds'.format(i))
- cdef ColumnSchema col = ColumnSchema()
- col.init_from_schema(self, i)
- return col
+ return ColumnSchema(self, i)
cdef class ColumnSchema:
@@ -487,13 +441,16 @@ cdef class ColumnSchema:
ParquetSchema parent
const ColumnDescriptor* descr
- def __cinit__(self):
- self.descr = NULL
-
- cdef init_from_schema(self, ParquetSchema schema, int i):
+ def __cinit__(self, ParquetSchema schema, int i):
self.parent = schema
self.descr = schema.schema.Column(i)
+ def __eq__(self, other):
+ try:
+ return self.equals(other)
+ except TypeError:
+ return NotImplemented
+
def equals(self, ColumnSchema other):
"""
Returns True if the column schemas are equal
@@ -520,52 +477,43 @@ cdef class ColumnSchema:
self.max_repetition_level, physical_type,
logical_type)
- property name:
-
- def __get__(self):
- return frombytes(self.descr.name())
-
- property path:
-
- def __get__(self):
- return frombytes(self.descr.path().get().ToDotString())
-
- property max_definition_level:
-
- def __get__(self):
- return self.descr.max_definition_level()
-
- property max_repetition_level:
+ @property
+ def name(self):
+ return frombytes(self.descr.name())
- def __get__(self):
- return self.descr.max_repetition_level()
+ @property
+ def path(self):
+ return frombytes(self.descr.path().get().ToDotString())
- property physical_type:
+ @property
+ def max_definition_level(self):
+ return self.descr.max_definition_level()
- def __get__(self):
- return physical_type_name_from_enum(self.descr.physical_type())
+ @property
+ def max_repetition_level(self):
+ return self.descr.max_repetition_level()
- property logical_type:
+ @property
+ def physical_type(self):
+ return physical_type_name_from_enum(self.descr.physical_type())
- def __get__(self):
- return logical_type_name_from_enum(self.descr.logical_type())
+ @property
+ def logical_type(self):
+ return logical_type_name_from_enum(self.descr.logical_type())
# FIXED_LEN_BYTE_ARRAY attribute
- property length:
-
- def __get__(self):
- return self.descr.type_length()
+ @property
+ def length(self):
+ return self.descr.type_length()
# Decimal attributes
- property precision:
-
- def __get__(self):
- return self.descr.type_precision()
-
- property scale:
+ @property
+ def precision(self):
+ return self.descr.type_precision()
- def __get__(self):
- return self.descr.type_scale()
+ @property
+ def scale(self):
+ return self.descr.type_scale()
cdef physical_type_name_from_enum(ParquetType type_):
@@ -609,19 +557,56 @@ cdef logical_type_name_from_enum(ParquetLogicalType
type_):
}.get(type_, 'UNKNOWN')
-cdef encoding_name_from_enum (ParquetEncoding encoding_):
+cdef encoding_name_from_enum(ParquetEncoding encoding_):
return {
- ParquetEncoding_PLAIN: "PLAIN",
- ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY",
- ParquetEncoding_RLE: "RLE",
- ParquetEncoding_BIT_PACKED: "BIT_PACKED",
- ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED",
- ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY",
- ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY",
- ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY",
+ ParquetEncoding_PLAIN: 'PLAIN',
+ ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY',
+ ParquetEncoding_RLE: 'RLE',
+ ParquetEncoding_BIT_PACKED: 'BIT_PACKED',
+ ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED',
+ ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY',
+ ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY',
+ ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY',
}.get(encoding_, 'UNKNOWN')
+cdef compression_name_from_enum(ParquetCompression compression_):
+ return {
+ ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED',
+ ParquetCompression_SNAPPY: 'SNAPPY',
+ ParquetCompression_GZIP: 'GZIP',
+ ParquetCompression_LZO: 'LZO',
+ ParquetCompression_BROTLI: 'BROTLI',
+ ParquetCompression_LZ4: 'LZ4',
+ ParquetCompression_ZSTD: 'ZSTD',
+ }.get(compression_, 'UNKNOWN')
+
+
+cdef int check_compression_name(name) except -1:
+ if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
+ 'ZSTD'}:
+ raise ArrowException("Unsupported compression: " + name)
+ return 0
+
+
+cdef ParquetCompression compression_from_name(str name):
+ name = name.upper()
+ if name == 'SNAPPY':
+ return ParquetCompression_SNAPPY
+ elif name == 'GZIP':
+ return ParquetCompression_GZIP
+ elif name == 'LZO':
+ return ParquetCompression_LZO
+ elif name == 'BROTLI':
+ return ParquetCompression_BROTLI
+ elif name == 'LZ4':
+ return ParquetCompression_LZ4
+ elif name == 'ZSTD':
+ return ParquetCompression_ZSTD
+ else:
+ return ParquetCompression_UNCOMPRESSED
+
+
cdef class ParquetReader:
cdef:
object source
@@ -654,22 +639,21 @@ cdef class ParquetReader:
check_status(OpenFile(rd_handle, self.allocator, properties,
c_metadata, &self.reader))
- property column_paths:
-
- def __get__(self):
- cdef:
- FileMetaData container = self.metadata
- const CFileMetaData* metadata = container._metadata
- vector[c_string] path
- int i = 0
+ @property
+ def column_paths(self):
+ cdef:
+ FileMetaData container = self.metadata
+ const CFileMetaData* metadata = container._metadata
+ vector[c_string] path
+ int i = 0
- paths = []
- for i in range(0, metadata.num_columns()):
- path = (metadata.schema().Column(i)
- .path().get().ToDotVector())
- paths.append([frombytes(x) for x in path])
+ paths = []
+ for i in range(0, metadata.num_columns()):
+ path = (metadata.schema().Column(i)
+ .path().get().ToDotVector())
+ paths.append([frombytes(x) for x in path])
- return paths
+ return paths
@property
def metadata(self):
@@ -686,10 +670,9 @@ cdef class ParquetReader:
result.init(metadata)
return result
- property num_row_groups:
-
- def __get__(self):
- return self.reader.get().num_row_groups()
+ @property
+ def num_row_groups(self):
+ return self.reader.get().num_row_groups()
def set_num_threads(self, int nthreads):
self.reader.get().set_num_threads(nthreads)
@@ -809,30 +792,6 @@ cdef class ParquetReader:
array.init(carray)
return array
-cdef int check_compression_name(name) except -1:
- if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
- 'ZSTD']:
- raise ArrowException("Unsupported compression: " + name)
- return 0
-
-
-cdef ParquetCompression compression_from_name(str name):
- name = name.upper()
- if name == "SNAPPY":
- return ParquetCompression_SNAPPY
- elif name == "GZIP":
- return ParquetCompression_GZIP
- elif name == "LZO":
- return ParquetCompression_LZO
- elif name == "BROTLI":
- return ParquetCompression_BROTLI
- elif name == "LZ4":
- return ParquetCompression_LZ4
- elif name == "ZSTD":
- return ParquetCompression_ZSTD
- else:
- return ParquetCompression_UNCOMPRESSED
-
cdef class ParquetWriter:
cdef:
diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py
index eea3e74..5ef9482 100644
--- a/python/pyarrow/formatting.py
+++ b/python/pyarrow/formatting.py
@@ -20,6 +20,12 @@
import pyarrow.lib as lib
import warnings
+try:
+ from textwrap import indent
+except ImportError:
+ def indent(text, prefix):
+ return ''.join(prefix + line for line in text.splitlines(True))
+
def array_format(arr, window=10):
warnings.warn("array_format is deprecated, use Array.format() instead",
@@ -32,13 +38,6 @@ def value_format(x, indent_level=0):
FutureWarning)
if isinstance(x, lib.ListValue):
contents = ',\n'.join(value_format(item) for item in x)
- return '[{0}]'.format(_indent(contents, 1).strip())
+ return '[{0}]'.format(indent(contents, ' ').strip())
else:
return repr(x)
-
-
-def _indent(text, spaces):
- if spaces == 0:
- return text
- block = ' ' * spaces
- return '\n'.join(block + x for x in text.split('\n'))
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 9c92737..2c1aef0 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -33,9 +33,11 @@ except ImportError:
import numpy as np
from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper
-from pyarrow._parquet import (ParquetReader, FileMetaData, # noqa
- RowGroupMetaData, ParquetSchema)
-import pyarrow._parquet as _parquet # noqa
+from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa
+ FileMetaData, RowGroupMetaData,
+ ColumnChunkMetaData,
+ ParquetSchema, ColumnSchema)
+import pyarrow._parquet as _parquet
import pyarrow.lib as lib
import pyarrow as pa
diff --git a/python/pyarrow/tests/test_parquet.py
b/python/pyarrow/tests/test_parquet.py
index c9c1a96..cc86ef1 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -499,10 +499,13 @@ def test_pandas_parquet_configuration_options(tmpdir):
tm.assert_frame_equal(df, df_read)
-def make_sample_file(df):
+def make_sample_file(table_or_df):
import pyarrow.parquet as pq
- a_table = pa.Table.from_pandas(df)
+ if isinstance(table_or_df, pa.Table):
+ a_table = table_or_df
+ else:
+ a_table = pa.Table.from_pandas(table_or_df)
buf = io.BytesIO()
_write_table(a_table, buf, compression='SNAPPY', version='2.0',
@@ -513,6 +516,8 @@ def make_sample_file(df):
def test_parquet_metadata_api():
+ import pyarrow.parquet as pq
+
df = alltypes_sample(size=10000)
df = df.reindex(columns=sorted(df.columns))
@@ -527,6 +532,8 @@ def test_parquet_metadata_api():
assert meta.num_row_groups == 1
assert meta.format_version == '2.0'
assert 'parquet-cpp' in meta.created_by
+ assert isinstance(meta.serialized_size, int)
+ assert isinstance(meta.metadata, dict)
# Schema
schema = fileh.schema
@@ -553,45 +560,85 @@ def test_parquet_metadata_api():
# Row group
for rg in range(meta.num_row_groups):
rg_meta = meta.row_group(rg)
+ assert isinstance(rg_meta, pq.RowGroupMetaData)
repr(rg_meta)
for col in range(rg_meta.num_columns):
col_meta = rg_meta.column(col)
+ assert isinstance(col_meta, pq.ColumnChunkMetaData)
repr(col_meta)
+ rg_meta = meta.row_group(0)
assert rg_meta.num_rows == len(df)
assert rg_meta.num_columns == ncols + 1 # +1 for index
+ assert rg_meta.total_byte_size > 0
+
+ col_meta = rg_meta.column(0)
+ assert col_meta.file_offset > 0
+ assert col_meta.file_path == '' # created from BytesIO
+ assert col_meta.physical_type == 'BOOLEAN'
+ assert col_meta.num_values == 10000
+ assert col_meta.path_in_schema == 'bool'
+ assert col_meta.is_stats_set is True
+ assert isinstance(col_meta.statistics, pq.RowGroupStatistics)
+ assert col_meta.compression == 'SNAPPY'
+ assert col_meta.encodings == ('PLAIN', 'RLE')
+ assert col_meta.has_dictionary_page is False
+ assert col_meta.dictionary_page_offset is None
+ assert col_meta.data_page_offset > 0
+ assert col_meta.total_compressed_size > 0
+ assert col_meta.total_uncompressed_size > 0
+ with pytest.raises(NotImplementedError):
+ col_meta.has_index_page
+ with pytest.raises(NotImplementedError):
+ col_meta.index_page_offset
@pytest.mark.parametrize(
- 'data, dtype, min_value, max_value, null_count, num_values',
+ (
+ 'data',
+ 'type',
+ 'physical_type',
+ 'min_value',
+ 'max_value',
+ 'null_count',
+ 'num_values',
+ 'distinct_count'
+ ),
[
- ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4),
- ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4),
- ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4),
- ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4),
- ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4),
- ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4),
- ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4),
- ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4),
- ([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4),
+ ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
+ ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
+ ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
+ ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
+ (
+ [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
+ 'FLOAT', -1.1, 4.4, 1, 4, 0
+ ),
(
- [u'', u'b', unichar(1000), None, u'aaa'],
- object, b'', unichar(1000).encode('utf-8'), 1, 4
+ [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
+ 'DOUBLE', -1.1, 4.4, 1, 4, 0
+ ),
+ (
+ [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(),
+ 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
+ ),
+ (
+ [True, False, False, True, True], pa.bool_(),
+ 'BOOLEAN', False, True, 0, 5, 0
),
- ([True, False, False, True, True], np.bool, False, True, 0, 5),
]
)
-def test_parquet_column_statistics_api(
- data,
- dtype,
- min_value,
- max_value,
- null_count,
- num_values):
- df = pd.DataFrame({'data': data}, dtype=dtype)
-
- fileh = make_sample_file(df)
+def test_parquet_column_statistics_api(data, type, physical_type, min_value,
+ max_value, null_count, num_values,
+ distinct_count):
+ df = pd.DataFrame({'data': data})
+ schema = pa.schema([pa.field('data', type)])
+ table = pa.Table.from_pandas(df, schema=schema)
+ fileh = make_sample_file(table)
meta = fileh.metadata
@@ -599,26 +646,43 @@ def test_parquet_column_statistics_api(
col_meta = rg_meta.column(0)
stat = col_meta.statistics
+ assert stat.has_min_max
assert stat.min == min_value
assert stat.max == max_value
assert stat.null_count == null_count
assert stat.num_values == num_values
+ # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount
+ # method, missing distinct_count is represented as zero instead of None
+ assert stat.distinct_count == distinct_count
+ assert stat.physical_type == physical_type
def test_compare_schemas():
+ import pyarrow.parquet as pq
+
df = alltypes_sample(size=10000)
fileh = make_sample_file(df)
fileh2 = make_sample_file(df)
fileh3 = make_sample_file(df[df.columns[::2]])
+ # ParquetSchema
+ assert isinstance(fileh.schema, pq.ParquetSchema)
assert fileh.schema.equals(fileh.schema)
+ assert fileh.schema == fileh.schema
assert fileh.schema.equals(fileh2.schema)
-
+ assert fileh.schema == fileh2.schema
+ assert fileh.schema != 'arbitrary object'
assert not fileh.schema.equals(fileh3.schema)
+ assert fileh.schema != fileh3.schema
+ # ColumnSchema
+ assert isinstance(fileh.schema[0], pq.ColumnSchema)
assert fileh.schema[0].equals(fileh.schema[0])
+ assert fileh.schema[0] == fileh.schema[0]
assert not fileh.schema[0].equals(fileh.schema[1])
+ assert fileh.schema[0] != fileh.schema[1]
+ assert fileh.schema[0] != 'arbitrary object'
def test_column_of_arrays(tmpdir):