This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 49ccf6a ARROW-2760: [Python] Remove legacy property definition syntax
from parquet module and test them
49ccf6a is described below
commit 49ccf6aee748660b03d99fc22d3f775cf0fd8e97
Author: Krisztián Szűcs <[email protected]>
AuthorDate: Fri Jul 27 11:26:00 2018 -0400
ARROW-2760: [Python] Remove legacy property definition syntax from parquet
module and test them
Author: Krisztián Szűcs <[email protected]>
Closes #2187 from kszucs/cython_properties and squashes the following
commits:
d6a7f779 <Krisztián Szűcs> remove accidentally committed test helper script
77c59d66 <Krisztián Szűcs> return NotImplemented from equality check
b1e7bede <Krisztián Szűcs> raise NotImplementedError for index_page_offset
71f5edee <Krisztián Szűcs> flake8
74d53bb9 <Krisztián Szűcs> missing distinct_count equals to zero
40ba9651 <Krisztián Szűcs> expose missing parquet classes to pq namespace
6942eba1 <Krisztián Szűcs> comments in compare schemas
1e1c7cd6 <Krisztián Szűcs> test and fix column chunk metadata properties
ffa10413 <Krisztián Szűcs> expected distinct_count value is None
d55b96a8 <Krisztián Szűcs> fix types
dd362295 <Krisztián Szűcs> rename type property to physical_type
a43153e9 <Krisztián Szűcs> warn instead of print
266a022b <Krisztián Szűcs> implement equality operator on ParquetSchema and
ColumnSchema
c58441a4 <Krisztián Szűcs> due to receiving extension class as argument
it's possible to use __cinit__ constructors directly instead of custom init
methods
06e4f8ed <Krisztián Szűcs> remove enum cpdef
c0e2a08c <Krisztián Szűcs> replace old property syntax; move indent to
formatting; test missing column stat properties
---
python/pyarrow/_parquet.pyx | 541 ++++++++++++++++-------------------
python/pyarrow/formatting.py | 15 +-
python/pyarrow/parquet.py | 8 +-
python/pyarrow/tests/test_parquet.py | 116 ++++++--
4 files changed, 352 insertions(+), 328 deletions(-)
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index 983ff8d..1aa2124 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -31,16 +31,11 @@ from pyarrow.lib cimport (Array, Schema,
NativeFile, get_reader, get_writer)
from pyarrow.compat import tobytes, frombytes
+from pyarrow.formatting import indent
from pyarrow.lib import ArrowException, NativeFile, _stringify_path
import six
-
-try:
- from textwrap import indent
-except ImportError:
- def indent(text, prefix):
- lines = [prefix + line for line in text.splitlines(True)]
- return ''.join(lines)
+import warnings
cdef class RowGroupStatistics:
@@ -95,49 +90,42 @@ cdef class RowGroupStatistics:
else:
raise ValueError('Unknown physical ParquetType')
- property has_min_max:
-
- def __get__(self):
- return self.statistics.get().HasMinMax()
-
- property min:
-
- def __get__(self):
- raw_physical_type = self.statistics.get().physical_type()
- encode_min = self.statistics.get().EncodeMin()
-
- min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
- return self._cast_statistic(min_value)
-
- property max:
-
- def __get__(self):
- raw_physical_type = self.statistics.get().physical_type()
- encode_max = self.statistics.get().EncodeMax()
-
- max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
- return self._cast_statistic(max_value)
+ @property
+ def has_min_max(self):
+ return self.statistics.get().HasMinMax()
- property null_count:
+ @property
+ def min(self):
+ raw_physical_type = self.statistics.get().physical_type()
+ encode_min = self.statistics.get().EncodeMin()
- def __get__(self):
- return self.statistics.get().null_count()
+ min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
+ return self._cast_statistic(min_value)
- property distinct_count:
+ @property
+ def max(self):
+ raw_physical_type = self.statistics.get().physical_type()
+ encode_max = self.statistics.get().EncodeMax()
- def __get__(self):
- return self.statistics.get().distinct_count()
+ max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
+ return self._cast_statistic(max_value)
- property num_values:
+ @property
+ def null_count(self):
+ return self.statistics.get().null_count()
- def __get__(self):
- return self.statistics.get().num_values()
+ @property
+ def distinct_count(self):
+ return self.statistics.get().distinct_count()
- property physical_type:
+ @property
+ def num_values(self):
+ return self.statistics.get().num_values()
- def __get__(self):
- physical_type = self.statistics.get().physical_type()
- return physical_type_name_from_enum(physical_type)
+ @property
+ def physical_type(self):
+ raw_physical_type = self.statistics.get().physical_type()
+ return physical_type_name_from_enum(raw_physical_type)
cdef class ColumnChunkMetaData:
@@ -157,7 +145,7 @@ cdef class ColumnChunkMetaData:
return """{0}
file_offset: {1}
file_path: {2}
- type: {3}
+ physical_type: {3}
num_values: {4}
path_in_schema: {5}
is_stats_set: {6}
@@ -168,12 +156,11 @@ cdef class ColumnChunkMetaData:
has_dictionary_page: {10}
dictionary_page_offset: {11}
data_page_offset: {12}
- index_page_offset: {13}
- total_compressed_size: {14}
- total_uncompressed_size: {15}""".format(object.__repr__(self),
+ total_compressed_size: {13}
+ total_uncompressed_size: {14}""".format(object.__repr__(self),
self.file_offset,
self.file_path,
- self.type,
+ self.physical_type,
self.num_values,
self.path_in_schema,
self.is_stats_set,
@@ -183,90 +170,80 @@ cdef class ColumnChunkMetaData:
self.has_dictionary_page,
self.dictionary_page_offset,
self.data_page_offset,
- self.index_page_offset,
self.total_compressed_size,
self.total_uncompressed_size)
- property file_offset:
-
- def __get__(self):
- return self.metadata.file_offset()
-
- property file_path:
-
- def __get__(self):
- return frombytes(self.metadata.file_path())
-
- property type:
-
- def __get__(self):
- return physical_type_name_from_enum(self.metadata.type())
-
- property num_values:
-
- def __get__(self):
- return self.metadata.num_values()
-
- property path_in_schema:
-
- def __get__(self):
- path = self.metadata.path_in_schema().get().ToDotString()
- return frombytes(path)
-
- property is_stats_set:
-
- def __get__(self):
- return self.metadata.is_stats_set()
+ @property
+ def file_offset(self):
+ return self.metadata.file_offset()
- property statistics:
+ @property
+ def file_path(self):
+ return frombytes(self.metadata.file_path())
- def __get__(self):
- if not self.metadata.is_stats_set():
- return None
- statistics = RowGroupStatistics()
- statistics.init(self.metadata.statistics())
- return statistics
+ @property
+ def physical_type(self):
+ return physical_type_name_from_enum(self.metadata.type())
- property compression:
+ @property
+ def num_values(self):
+ return self.metadata.num_values()
- def __get__(self):
- return self.metadata.compression()
+ @property
+ def path_in_schema(self):
+ path = self.metadata.path_in_schema().get().ToDotString()
+ return frombytes(path)
- property encodings:
+ @property
+ def is_stats_set(self):
+ return self.metadata.is_stats_set()
- def __get__(self):
- return map(encoding_name_from_enum,
- self.metadata.encodings())
+ @property
+ def statistics(self):
+ if not self.metadata.is_stats_set():
+ return None
+ statistics = RowGroupStatistics()
+ statistics.init(self.metadata.statistics())
+ return statistics
- property has_dictionary_page:
+ @property
+ def compression(self):
+ return compression_name_from_enum(self.metadata.compression())
- def __get__(self):
- return self.metadata.has_dictionary_page()
+ @property
+ def encodings(self):
+ return tuple(map(encoding_name_from_enum, self.metadata.encodings()))
- property dictionary_page_offset:
+ @property
+ def has_dictionary_page(self):
+ return bool(self.metadata.has_dictionary_page())
- def __get__(self):
+ @property
+ def dictionary_page_offset(self):
+ if self.has_dictionary_page:
return self.metadata.dictionary_page_offset()
+ else:
+ return None
- property data_page_offset:
-
- def __get__(self):
- return self.metadata.data_page_offset()
-
- property index_page_offset:
-
- def __get__(self):
- return self.metadata.index_page_offset()
+ @property
+ def data_page_offset(self):
+ return self.metadata.data_page_offset()
- property total_compressed_size:
+ @property
+ def has_index_page(self):
+ raise NotImplementedError('not supported in parquet-cpp')
- def __get__(self):
- return self.metadata.total_compressed_size()
+ @property
+ def index_page_offset(self):
+ raise NotImplementedError("parquet-cpp doesn't return valid values")
- property total_uncompressed_size:
+ @property
+ def total_compressed_size(self):
+ return self.metadata.total_compressed_size()
- def __get__(self):
- return self.metadata.total_uncompressed_size()
+ @property
+ def total_uncompressed_size(self):
+ return self.metadata.total_uncompressed_size()
cdef class RowGroupMetaData:
@@ -275,10 +252,7 @@ cdef class RowGroupMetaData:
CRowGroupMetaData* metadata
FileMetaData parent
- def __cinit__(self):
- pass
-
- cdef void init_from_file(self, FileMetaData parent, int i):
+ def __cinit__(self, FileMetaData parent, int i):
if i < 0 or i >= parent.num_row_groups:
raise IndexError('{0} out of bounds'.format(i))
self.up_metadata = parent._metadata.RowGroup(i)
@@ -299,20 +273,17 @@ cdef class RowGroupMetaData:
self.num_rows,
self.total_byte_size)
- property num_columns:
-
- def __get__(self):
- return self.metadata.num_columns()
-
- property num_rows:
-
- def __get__(self):
- return self.metadata.num_rows()
+ @property
+ def num_columns(self):
+ return self.metadata.num_columns()
- property total_byte_size:
+ @property
+ def num_rows(self):
+ return self.metadata.num_rows()
- def __get__(self):
- return self.metadata.total_byte_size()
+ @property
+ def total_byte_size(self):
+ return self.metadata.total_byte_size()
cdef class FileMetaData:
@@ -343,72 +314,56 @@ cdef class FileMetaData:
@property
def schema(self):
- if self._schema is not None:
- return self._schema
-
- cdef ParquetSchema schema = ParquetSchema()
- schema.init_from_filemeta(self)
- self._schema = schema
- return schema
-
- property serialized_size:
-
- def __get__(self):
- return self._metadata.size()
-
- property num_columns:
-
- def __get__(self):
- return self._metadata.num_columns()
+ if self._schema is None:
+ self._schema = ParquetSchema(self)
+ return self._schema
- property num_rows:
-
- def __get__(self):
- return self._metadata.num_rows()
+ @property
+ def serialized_size(self):
+ return self._metadata.size()
- property num_row_groups:
+ @property
+ def num_columns(self):
+ return self._metadata.num_columns()
- def __get__(self):
- return self._metadata.num_row_groups()
+ @property
+ def num_rows(self):
+ return self._metadata.num_rows()
- property format_version:
+ @property
+ def num_row_groups(self):
+ return self._metadata.num_row_groups()
- def __get__(self):
- cdef ParquetVersion version = self._metadata.version()
- if version == ParquetVersion_V1:
- return '1.0'
- if version == ParquetVersion_V2:
- return '2.0'
- else:
- print('Unrecognized file version, assuming 1.0: {0}'
- .format(version))
- return '1.0'
+ @property
+ def format_version(self):
+ cdef ParquetVersion version = self._metadata.version()
+ if version == ParquetVersion_V1:
+ return '1.0'
+ if version == ParquetVersion_V2:
+ return '2.0'
+ else:
+ warnings.warn('Unrecognized file version, assuming 1.0: {}'
+ .format(version))
+ return '1.0'
- property created_by:
+ @property
+ def created_by(self):
+ return frombytes(self._metadata.created_by())
- def __get__(self):
- return frombytes(self._metadata.created_by())
+ @property
+ def metadata(self):
+ cdef:
+ unordered_map[c_string, c_string] metadata
+ const CKeyValueMetadata* underlying_metadata
+ underlying_metadata = self._metadata.key_value_metadata().get()
+ if underlying_metadata != NULL:
+ underlying_metadata.ToUnorderedMap(&metadata)
+ return metadata
+ else:
+ return None
def row_group(self, int i):
- """
-
- """
- cdef RowGroupMetaData result = RowGroupMetaData()
- result.init_from_file(self, i)
- return result
-
- property metadata:
-
- def __get__(self):
- cdef:
- unordered_map[c_string, c_string] metadata
- const CKeyValueMetadata* underlying_metadata
- underlying_metadata = self._metadata.key_value_metadata().get()
- if underlying_metadata != NULL:
- underlying_metadata.ToUnorderedMap(&metadata)
- return metadata
- else:
- return None
+ return RowGroupMetaData(self, i)
cdef class ParquetSchema:
@@ -416,8 +371,9 @@ cdef class ParquetSchema:
FileMetaData parent # the FileMetaData owning the SchemaDescriptor
const SchemaDescriptor* schema
- def __cinit__(self):
- self.schema = NULL
+ def __cinit__(self, FileMetaData container):
+ self.parent = container
+ self.schema = container._metadata.schema()
def __repr__(self):
cdef const ColumnDescriptor* descr
@@ -434,20 +390,15 @@ cdef class ParquetSchema:
{1}
""".format(object.__repr__(self), '\n'.join(elements))
- cdef init_from_filemeta(self, FileMetaData container):
- self.parent = container
- self.schema = container._metadata.schema()
-
def __len__(self):
return self.schema.num_columns()
def __getitem__(self, i):
return self.column(i)
- property names:
-
- def __get__(self):
- return [self[i].name for i in range(len(self))]
+ @property
+ def names(self):
+ return [self[i].name for i in range(len(self))]
def to_arrow_schema(self):
"""
@@ -457,8 +408,7 @@ cdef class ParquetSchema:
-------
schema : pyarrow.Schema
"""
- cdef:
- shared_ptr[CSchema] sp_arrow_schema
+ cdef shared_ptr[CSchema] sp_arrow_schema
with nogil:
check_status(FromParquetSchema(
@@ -467,6 +417,12 @@ cdef class ParquetSchema:
return pyarrow_wrap_schema(sp_arrow_schema)
+ def __eq__(self, other):
+ try:
+ return self.equals(other)
+ except TypeError:
+ return NotImplemented
+
def equals(self, ParquetSchema other):
"""
Returns True if the Parquet schemas are equal
@@ -477,9 +433,7 @@ cdef class ParquetSchema:
if i < 0 or i >= len(self):
raise IndexError('{0} out of bounds'.format(i))
- cdef ColumnSchema col = ColumnSchema()
- col.init_from_schema(self, i)
- return col
+ return ColumnSchema(self, i)
cdef class ColumnSchema:
@@ -487,13 +441,16 @@ cdef class ColumnSchema:
ParquetSchema parent
const ColumnDescriptor* descr
- def __cinit__(self):
- self.descr = NULL
-
- cdef init_from_schema(self, ParquetSchema schema, int i):
+ def __cinit__(self, ParquetSchema schema, int i):
self.parent = schema
self.descr = schema.schema.Column(i)
+ def __eq__(self, other):
+ try:
+ return self.equals(other)
+ except TypeError:
+ return NotImplemented
+
def equals(self, ColumnSchema other):
"""
Returns True if the column schemas are equal
@@ -520,52 +477,43 @@ cdef class ColumnSchema:
self.max_repetition_level, physical_type,
logical_type)
- property name:
-
- def __get__(self):
- return frombytes(self.descr.name())
-
- property path:
-
- def __get__(self):
- return frombytes(self.descr.path().get().ToDotString())
-
- property max_definition_level:
-
- def __get__(self):
- return self.descr.max_definition_level()
-
- property max_repetition_level:
+ @property
+ def name(self):
+ return frombytes(self.descr.name())
- def __get__(self):
- return self.descr.max_repetition_level()
+ @property
+ def path(self):
+ return frombytes(self.descr.path().get().ToDotString())
- property physical_type:
+ @property
+ def max_definition_level(self):
+ return self.descr.max_definition_level()
- def __get__(self):
- return physical_type_name_from_enum(self.descr.physical_type())
+ @property
+ def max_repetition_level(self):
+ return self.descr.max_repetition_level()
- property logical_type:
+ @property
+ def physical_type(self):
+ return physical_type_name_from_enum(self.descr.physical_type())
- def __get__(self):
- return logical_type_name_from_enum(self.descr.logical_type())
+ @property
+ def logical_type(self):
+ return logical_type_name_from_enum(self.descr.logical_type())
# FIXED_LEN_BYTE_ARRAY attribute
- property length:
-
- def __get__(self):
- return self.descr.type_length()
+ @property
+ def length(self):
+ return self.descr.type_length()
# Decimal attributes
- property precision:
-
- def __get__(self):
- return self.descr.type_precision()
-
- property scale:
+ @property
+ def precision(self):
+ return self.descr.type_precision()
- def __get__(self):
- return self.descr.type_scale()
+ @property
+ def scale(self):
+ return self.descr.type_scale()
cdef physical_type_name_from_enum(ParquetType type_):
@@ -609,19 +557,56 @@ cdef logical_type_name_from_enum(ParquetLogicalType
type_):
}.get(type_, 'UNKNOWN')
-cdef encoding_name_from_enum (ParquetEncoding encoding_):
+cdef encoding_name_from_enum(ParquetEncoding encoding_):
return {
- ParquetEncoding_PLAIN: "PLAIN",
- ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY",
- ParquetEncoding_RLE: "RLE",
- ParquetEncoding_BIT_PACKED: "BIT_PACKED",
- ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED",
- ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY",
- ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY",
- ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY",
+ ParquetEncoding_PLAIN: 'PLAIN',
+ ParquetEncoding_PLAIN_DICTIONARY: 'PLAIN_DICTIONARY',
+ ParquetEncoding_RLE: 'RLE',
+ ParquetEncoding_BIT_PACKED: 'BIT_PACKED',
+ ParquetEncoding_DELTA_BINARY_PACKED: 'DELTA_BINARY_PACKED',
+ ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: 'DELTA_LENGTH_BYTE_ARRAY',
+ ParquetEncoding_DELTA_BYTE_ARRAY: 'DELTA_BYTE_ARRAY',
+ ParquetEncoding_RLE_DICTIONARY: 'RLE_DICTIONARY',
}.get(encoding_, 'UNKNOWN')
+cdef compression_name_from_enum(ParquetCompression compression_):
+ return {
+ ParquetCompression_UNCOMPRESSED: 'UNCOMPRESSED',
+ ParquetCompression_SNAPPY: 'SNAPPY',
+ ParquetCompression_GZIP: 'GZIP',
+ ParquetCompression_LZO: 'LZO',
+ ParquetCompression_BROTLI: 'BROTLI',
+ ParquetCompression_LZ4: 'LZ4',
+ ParquetCompression_ZSTD: 'ZSTD',
+ }.get(compression_, 'UNKNOWN')
+
+
+cdef int check_compression_name(name) except -1:
+ if name.upper() not in {'NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
+ 'ZSTD'}:
+ raise ArrowException("Unsupported compression: " + name)
+ return 0
+
+
+cdef ParquetCompression compression_from_name(str name):
+ name = name.upper()
+ if name == 'SNAPPY':
+ return ParquetCompression_SNAPPY
+ elif name == 'GZIP':
+ return ParquetCompression_GZIP
+ elif name == 'LZO':
+ return ParquetCompression_LZO
+ elif name == 'BROTLI':
+ return ParquetCompression_BROTLI
+ elif name == 'LZ4':
+ return ParquetCompression_LZ4
+ elif name == 'ZSTD':
+ return ParquetCompression_ZSTD
+ else:
+ return ParquetCompression_UNCOMPRESSED
+
+
cdef class ParquetReader:
cdef:
object source
@@ -654,22 +639,21 @@ cdef class ParquetReader:
check_status(OpenFile(rd_handle, self.allocator, properties,
c_metadata, &self.reader))
- property column_paths:
-
- def __get__(self):
- cdef:
- FileMetaData container = self.metadata
- const CFileMetaData* metadata = container._metadata
- vector[c_string] path
- int i = 0
+ @property
+ def column_paths(self):
+ cdef:
+ FileMetaData container = self.metadata
+ const CFileMetaData* metadata = container._metadata
+ vector[c_string] path
+ int i = 0
- paths = []
- for i in range(0, metadata.num_columns()):
- path = (metadata.schema().Column(i)
- .path().get().ToDotVector())
- paths.append([frombytes(x) for x in path])
+ paths = []
+ for i in range(0, metadata.num_columns()):
+ path = (metadata.schema().Column(i)
+ .path().get().ToDotVector())
+ paths.append([frombytes(x) for x in path])
- return paths
+ return paths
@property
def metadata(self):
@@ -686,10 +670,9 @@ cdef class ParquetReader:
result.init(metadata)
return result
- property num_row_groups:
-
- def __get__(self):
- return self.reader.get().num_row_groups()
+ @property
+ def num_row_groups(self):
+ return self.reader.get().num_row_groups()
def set_num_threads(self, int nthreads):
self.reader.get().set_num_threads(nthreads)
@@ -809,30 +792,6 @@ cdef class ParquetReader:
array.init(carray)
return array
-cdef int check_compression_name(name) except -1:
- if name.upper() not in ['NONE', 'SNAPPY', 'GZIP', 'LZO', 'BROTLI', 'LZ4',
- 'ZSTD']:
- raise ArrowException("Unsupported compression: " + name)
- return 0
-
-
-cdef ParquetCompression compression_from_name(str name):
- name = name.upper()
- if name == "SNAPPY":
- return ParquetCompression_SNAPPY
- elif name == "GZIP":
- return ParquetCompression_GZIP
- elif name == "LZO":
- return ParquetCompression_LZO
- elif name == "BROTLI":
- return ParquetCompression_BROTLI
- elif name == "LZ4":
- return ParquetCompression_LZ4
- elif name == "ZSTD":
- return ParquetCompression_ZSTD
- else:
- return ParquetCompression_UNCOMPRESSED
-
cdef class ParquetWriter:
cdef:
diff --git a/python/pyarrow/formatting.py b/python/pyarrow/formatting.py
index eea3e74..5ef9482 100644
--- a/python/pyarrow/formatting.py
+++ b/python/pyarrow/formatting.py
@@ -20,6 +20,12 @@
import pyarrow.lib as lib
import warnings
+try:
+ from textwrap import indent
+except ImportError:
+ def indent(text, prefix):
+ return ''.join(prefix + line for line in text.splitlines(True))
+
def array_format(arr, window=10):
warnings.warn("array_format is deprecated, use Array.format() instead",
@@ -32,13 +38,6 @@ def value_format(x, indent_level=0):
FutureWarning)
if isinstance(x, lib.ListValue):
contents = ',\n'.join(value_format(item) for item in x)
- return '[{0}]'.format(_indent(contents, 1).strip())
+ return '[{0}]'.format(indent(contents, ' ').strip())
else:
return repr(x)
-
-
-def _indent(text, spaces):
- if spaces == 0:
- return text
- block = ' ' * spaces
- return '\n'.join(block + x for x in text.split('\n'))
diff --git a/python/pyarrow/parquet.py b/python/pyarrow/parquet.py
index 9c92737..2c1aef0 100644
--- a/python/pyarrow/parquet.py
+++ b/python/pyarrow/parquet.py
@@ -33,9 +33,11 @@ except ImportError:
import numpy as np
from pyarrow.filesystem import FileSystem, LocalFileSystem, S3FSWrapper
-from pyarrow._parquet import (ParquetReader, FileMetaData, # noqa
- RowGroupMetaData, ParquetSchema)
-import pyarrow._parquet as _parquet # noqa
+from pyarrow._parquet import (ParquetReader, RowGroupStatistics, # noqa
+ FileMetaData, RowGroupMetaData,
+ ColumnChunkMetaData,
+ ParquetSchema, ColumnSchema)
+import pyarrow._parquet as _parquet
import pyarrow.lib as lib
import pyarrow as pa
diff --git a/python/pyarrow/tests/test_parquet.py
b/python/pyarrow/tests/test_parquet.py
index c9c1a96..cc86ef1 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -499,10 +499,13 @@ def test_pandas_parquet_configuration_options(tmpdir):
tm.assert_frame_equal(df, df_read)
-def make_sample_file(df):
+def make_sample_file(table_or_df):
import pyarrow.parquet as pq
- a_table = pa.Table.from_pandas(df)
+ if isinstance(table_or_df, pa.Table):
+ a_table = table_or_df
+ else:
+ a_table = pa.Table.from_pandas(table_or_df)
buf = io.BytesIO()
_write_table(a_table, buf, compression='SNAPPY', version='2.0',
@@ -513,6 +516,8 @@ def make_sample_file(df):
def test_parquet_metadata_api():
+ import pyarrow.parquet as pq
+
df = alltypes_sample(size=10000)
df = df.reindex(columns=sorted(df.columns))
@@ -527,6 +532,8 @@ def test_parquet_metadata_api():
assert meta.num_row_groups == 1
assert meta.format_version == '2.0'
assert 'parquet-cpp' in meta.created_by
+ assert isinstance(meta.serialized_size, int)
+ assert isinstance(meta.metadata, dict)
# Schema
schema = fileh.schema
@@ -553,45 +560,85 @@ def test_parquet_metadata_api():
# Row group
for rg in range(meta.num_row_groups):
rg_meta = meta.row_group(rg)
+ assert isinstance(rg_meta, pq.RowGroupMetaData)
repr(rg_meta)
for col in range(rg_meta.num_columns):
col_meta = rg_meta.column(col)
+ assert isinstance(col_meta, pq.ColumnChunkMetaData)
repr(col_meta)
+ rg_meta = meta.row_group(0)
assert rg_meta.num_rows == len(df)
assert rg_meta.num_columns == ncols + 1 # +1 for index
+ assert rg_meta.total_byte_size > 0
+
+ col_meta = rg_meta.column(0)
+ assert col_meta.file_offset > 0
+ assert col_meta.file_path == '' # created from BytesIO
+ assert col_meta.physical_type == 'BOOLEAN'
+ assert col_meta.num_values == 10000
+ assert col_meta.path_in_schema == 'bool'
+ assert col_meta.is_stats_set is True
+ assert isinstance(col_meta.statistics, pq.RowGroupStatistics)
+ assert col_meta.compression == 'SNAPPY'
+ assert col_meta.encodings == ('PLAIN', 'RLE')
+ assert col_meta.has_dictionary_page is False
+ assert col_meta.dictionary_page_offset is None
+ assert col_meta.data_page_offset > 0
+ assert col_meta.total_compressed_size > 0
+ assert col_meta.total_uncompressed_size > 0
+ with pytest.raises(NotImplementedError):
+ col_meta.has_index_page
+ with pytest.raises(NotImplementedError):
+ col_meta.index_page_offset
@pytest.mark.parametrize(
- 'data, dtype, min_value, max_value, null_count, num_values',
+ (
+ 'data',
+ 'type',
+ 'physical_type',
+ 'min_value',
+ 'max_value',
+ 'null_count',
+ 'num_values',
+ 'distinct_count'
+ ),
[
- ([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4),
- ([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4),
- ([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4),
- ([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4),
- ([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4),
- ([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4),
- ([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4),
- ([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4),
- ([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4),
+ ([1, 2, 2, None, 4], pa.uint8(), 'INT32', 1, 4, 1, 4, 0),
+ ([1, 2, 2, None, 4], pa.uint16(), 'INT32', 1, 4, 1, 4, 0),
+ ([1, 2, 2, None, 4], pa.uint32(), 'INT32', 1, 4, 1, 4, 0),
+ ([1, 2, 2, None, 4], pa.uint64(), 'INT64', 1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int8(), 'INT32', -1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int16(), 'INT32', -1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int32(), 'INT32', -1, 4, 1, 4, 0),
+ ([-1, 2, 2, None, 4], pa.int64(), 'INT64', -1, 4, 1, 4, 0),
+ (
+ [-1.1, 2.2, 2.3, None, 4.4], pa.float32(),
+ 'FLOAT', -1.1, 4.4, 1, 4, 0
+ ),
(
- [u'', u'b', unichar(1000), None, u'aaa'],
- object, b'', unichar(1000).encode('utf-8'), 1, 4
+ [-1.1, 2.2, 2.3, None, 4.4], pa.float64(),
+ 'DOUBLE', -1.1, 4.4, 1, 4, 0
+ ),
+ (
+ [u'', u'b', unichar(1000), None, u'aaa'], pa.binary(),
+ 'BYTE_ARRAY', b'', unichar(1000).encode('utf-8'), 1, 4, 0
+ ),
+ (
+ [True, False, False, True, True], pa.bool_(),
+ 'BOOLEAN', False, True, 0, 5, 0
),
- ([True, False, False, True, True], np.bool, False, True, 0, 5),
]
)
-def test_parquet_column_statistics_api(
- data,
- dtype,
- min_value,
- max_value,
- null_count,
- num_values):
- df = pd.DataFrame({'data': data}, dtype=dtype)
-
- fileh = make_sample_file(df)
+def test_parquet_column_statistics_api(data, type, physical_type, min_value,
+ max_value, null_count, num_values,
+ distinct_count):
+ df = pd.DataFrame({'data': data})
+ schema = pa.schema([pa.field('data', type)])
+ table = pa.Table.from_pandas(df, schema=schema)
+ fileh = make_sample_file(table)
meta = fileh.metadata
@@ -599,26 +646,43 @@ def test_parquet_column_statistics_api(
col_meta = rg_meta.column(0)
stat = col_meta.statistics
+ assert stat.has_min_max
assert stat.min == min_value
assert stat.max == max_value
assert stat.null_count == null_count
assert stat.num_values == num_values
+ # TODO(kszucs) until parquet-cpp API doesn't expose HasDistinctCount
+ # method, missing distinct_count is represented as zero instead of None
+ assert stat.distinct_count == distinct_count
+ assert stat.physical_type == physical_type
def test_compare_schemas():
+ import pyarrow.parquet as pq
+
df = alltypes_sample(size=10000)
fileh = make_sample_file(df)
fileh2 = make_sample_file(df)
fileh3 = make_sample_file(df[df.columns[::2]])
+ # ParquetSchema
+ assert isinstance(fileh.schema, pq.ParquetSchema)
assert fileh.schema.equals(fileh.schema)
+ assert fileh.schema == fileh.schema
assert fileh.schema.equals(fileh2.schema)
-
+ assert fileh.schema == fileh2.schema
+ assert fileh.schema != 'arbitrary object'
assert not fileh.schema.equals(fileh3.schema)
+ assert fileh.schema != fileh3.schema
+ # ColumnSchema
+ assert isinstance(fileh.schema[0], pq.ColumnSchema)
assert fileh.schema[0].equals(fileh.schema[0])
+ assert fileh.schema[0] == fileh.schema[0]
assert not fileh.schema[0].equals(fileh.schema[1])
+ assert fileh.schema[0] != fileh.schema[1]
+ assert fileh.schema[0] != 'arbitrary object'
def test_column_of_arrays(tmpdir):