[ https://issues.apache.org/jira/browse/ARROW-480?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16239912#comment-16239912 ]

ASF GitHub Bot commented on ARROW-480:
--------------------------------------

wesm closed pull request #1215: ARROW-480: [Python] Implement RowGroupMetaData.ColumnChunk
URL: https://github.com/apache/arrow/pull/1215
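
For reference, a minimal usage sketch (not part of the patch itself) of the
accessors this PR adds to the Python API. The path 'example.parquet' is a
hypothetical placeholder; note that at this stage of the patch 'compression'
is reported as a raw enum value:

    import pyarrow.parquet as pq

    pf = pq.ParquetFile('example.parquet')   # hypothetical file
    meta = pf.metadata

    rg_meta = meta.row_group(0)     # RowGroupMetaData
    col_meta = rg_meta.column(0)    # new: ColumnChunkMetaData

    # Per-chunk layout information exposed by this patch
    print(col_meta.path_in_schema, col_meta.num_values,
          col_meta.compression, col_meta.total_compressed_size)

    # new: RowGroupStatistics, with values decoded via FormatStatValue
    stat = col_meta.statistics
    if col_meta.is_stats_set and stat.has_min_max:
        print(stat.min, stat.max, stat.null_count, stat.num_values)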
This is a PR merged from a forked repository. As GitHub hides the
original diff on merge, it is displayed below for the sake of provenance:

diff --git a/python/pyarrow/_parquet.pxd b/python/pyarrow/_parquet.pxd
index 5094232bd..04a5b1368 100644
--- a/python/pyarrow/_parquet.pxd
+++ b/python/pyarrow/_parquet.pxd
@@ -105,6 +105,11 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
         ParquetVersion_V1" parquet::ParquetVersion::PARQUET_1_0"
         ParquetVersion_V2" parquet::ParquetVersion::PARQUET_2_0"
 
+    enum ParquetSortOrder" parquet::SortOrder::type":
+        ParquetSortOrder_SIGNED" parquet::SortOrder::SIGNED"
+        ParquetSortOrder_UNSIGNED" parquet::SortOrder::UNSIGNED"
+        ParquetSortOrder_UNKNOWN" parquet::SortOrder::UNKNOWN"
+
     cdef cppclass ColumnDescriptor:
         c_bool Equals(const ColumnDescriptor& other)
 
@@ -126,6 +131,8 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
         c_bool Equals(const SchemaDescriptor& other)
         int num_columns()
 
+    cdef c_string FormatStatValue(ParquetType parquet_type, const char* val)
+
 
 cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
     cdef cppclass ColumnReader:
@@ -155,10 +162,52 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil:
     cdef cppclass RowGroupReader:
         pass
 
+    cdef cppclass CEncodedStatistics" parquet::EncodedStatistics":
+        const c_string& max() const
+        const c_string& min() const
+        int64_t null_count
+        int64_t distinct_count
+        bint has_min
+        bint has_max
+        bint has_null_count
+        bint has_distinct_count
+
+    cdef cppclass CRowGroupStatistics" parquet::RowGroupStatistics":
+        int64_t null_count() const
+        int64_t distinct_count() const
+        int64_t num_values() const
+        bint HasMinMax()
+        void Reset()
+        c_string EncodeMin()
+        c_string EncodeMax()
+        CEncodedStatistics Encode()
+        void SetComparator()
+        ParquetType physical_type() const
+
+    cdef cppclass CColumnChunkMetaData" parquet::ColumnChunkMetaData":
+        int64_t file_offset() const
+        const c_string& file_path() const
+
+        ParquetType type() const
+        int64_t num_values() const
+        shared_ptr[ColumnPath] path_in_schema() const
+        bint is_stats_set() const
+        shared_ptr[CRowGroupStatistics] statistics() const
+        ParquetCompression compression() const
+        const vector[ParquetEncoding]& encodings() const
+
+        bint has_dictionary_page() const
+        int64_t dictionary_page_offset() const
+        int64_t data_page_offset() const
+        int64_t index_page_offset() const
+        int64_t total_compressed_size() const
+        int64_t total_uncompressed_size() const
+
     cdef cppclass CRowGroupMetaData" parquet::RowGroupMetaData":
         int num_columns()
         int64_t num_rows()
         int64_t total_byte_size()
+        unique_ptr[CColumnChunkMetaData] ColumnChunk(int i) const
 
     cdef cppclass CFileMetaData" parquet::FileMetaData":
         uint32_t size()
diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx
index b096fa1b4..eca6b201b 100644
--- a/python/pyarrow/_parquet.pyx
+++ b/python/pyarrow/_parquet.pyx
@@ -35,6 +35,212 @@ from pyarrow.lib import ArrowException, NativeFile
 
 import six
 
+try:
+    from textwrap import indent
+except ImportError:
+    def indent(text, prefix):
+        lines = [prefix + line for line in text.splitlines(True)]
+        return ''.join(lines)
+
+
+cdef class RowGroupStatistics:
+    cdef:
+        shared_ptr[CRowGroupStatistics] statistics
+
+    def __cinit__(self):
+        pass
+
+    cdef init(self, const shared_ptr[CRowGroupStatistics]& statistics):
+        self.statistics = statistics
+
+    def __repr__(self):
+        return """{0}
+  has_min_max: {1}
+  min: {2}
+  max: {3}
+  null_count: {4}
+  distinct_count: {5}
+  num_values: {6}
+  physical_type: {7}""".format(object.__repr__(self),
+                               self.has_min_max,
+                               self.min,
+                               self.max,
+                               self.null_count,
+                               self.distinct_count,
+                               self.num_values,
+                               self.physical_type)
+
+    property has_min_max:
+
+        def __get__(self):
+            return self.statistics.get().HasMinMax()
+
+    property min:
+
+        def __get__(self):
+            raw_physical_type = self.statistics.get().physical_type()
+            encode_min = self.statistics.get().EncodeMin()
+
+            min_value = FormatStatValue(raw_physical_type, encode_min.c_str())
+            return frombytes(min_value)
+
+    property max:
+
+        def __get__(self):
+            raw_physical_type = self.statistics.get().physical_type()
+            encode_max = self.statistics.get().EncodeMax()
+
+            max_value = FormatStatValue(raw_physical_type, encode_max.c_str())
+            return frombytes(max_value)
+
+    property null_count:
+
+        def __get__(self):
+            return self.statistics.get().null_count()
+
+    property distinct_count:
+
+        def __get__(self):
+            return self.statistics.get().distinct_count()
+
+    property num_values:
+
+        def __get__(self):
+            return self.statistics.get().num_values()
+
+    property physical_type:
+
+        def __get__(self):
+            physical_type = self.statistics.get().physical_type()
+            return physical_type_name_from_enum(physical_type)
+
+
+cdef class ColumnChunkMetaData:
+    cdef:
+        unique_ptr[CColumnChunkMetaData] up_metadata
+        CColumnChunkMetaData* metadata
+
+    def __cinit__(self):
+        pass
+
+    cdef init(self, const CRowGroupMetaData& row_group_metadata, int i):
+        self.up_metadata = row_group_metadata.ColumnChunk(i)
+        self.metadata = self.up_metadata.get()
+
+    def __repr__(self):
+        statistics = indent(repr(self.statistics), 4 * ' ')
+        return """{0}
+  file_offset: {1}
+  file_path: {2}
+  type: {3}
+  num_values: {4}
+  path_in_schema: {5}
+  is_stats_set: {6}
+  statistics:
+{7}
+  compression: {8}
+  encodings: {9}
+  has_dictionary_page: {10}
+  dictionary_page_offset: {11}
+  data_page_offset: {12}
+  index_page_offset: {13}
+  total_compressed_size: {14}
+  total_uncompressed_size: {15}""".format(object.__repr__(self),
+                                          self.file_offset,
+                                          self.file_path,
+                                          self.type,
+                                          self.num_values,
+                                          self.path_in_schema,
+                                          self.is_stats_set,
+                                          statistics,
+                                          self.compression,
+                                          self.encodings,
+                                          self.has_dictionary_page,
+                                          self.dictionary_page_offset,
+                                          self.data_page_offset,
+                                          self.index_page_offset,
+                                          self.total_compressed_size,
+                                          self.total_uncompressed_size)
+
+    property file_offset:
+
+        def __get__(self):
+            return self.metadata.file_offset()
+
+    property file_path:
+
+        def __get__(self):
+            return frombytes(self.metadata.file_path())
+
+    property type:
+
+        def __get__(self):
+            return physical_type_name_from_enum(self.metadata.type())
+
+    property num_values:
+
+        def __get__(self):
+            return self.metadata.num_values()
+
+    property path_in_schema:
+
+        def __get__(self):
+            path = self.metadata.path_in_schema().get().ToDotString()
+            return frombytes(path)
+
+    property is_stats_set:
+
+        def __get__(self):
+            return self.metadata.is_stats_set()
+
+    property statistics:
+
+        def __get__(self):
+            statistics = RowGroupStatistics()
+            statistics.init(self.metadata.statistics())
+            return statistics
+
+    property compression:
+
+        def __get__(self):
+            return self.metadata.compression()
+
+    property encodings:
+
+        def __get__(self):
+            return list(map(encoding_name_from_enum,
+                            self.metadata.encodings()))
+
+    property has_dictionary_page:
+
+        def __get__(self):
+            return self.metadata.has_dictionary_page()
+
+    property dictionary_page_offset:
+
+        def __get__(self):
+            return self.metadata.dictionary_page_offset()
+
+    property data_page_offset:
+
+        def __get__(self):
+            return self.metadata.data_page_offset()
+
+    property index_page_offset:
+
+        def __get__(self):
+            return self.metadata.index_page_offset()
+
+    property total_compressed_size:
+
+        def __get__(self):
+            return self.metadata.total_compressed_size()
+
+    property total_uncompressed_size:
+
+        def __get__(self):
+            return self.metadata.total_uncompressed_size()
+
 
 cdef class RowGroupMetaData:
     cdef:
@@ -52,6 +258,11 @@ cdef class RowGroupMetaData:
         self.metadata = self.up_metadata.get()
         self.parent = parent
 
+    def column(self, int i):
+        chunk = ColumnChunkMetaData()
+        chunk.init(deref(self.metadata), i)
+        return chunk
+
     def __repr__(self):
         return """{0}
   num_columns: {1}
@@ -371,6 +582,19 @@ cdef logical_type_name_from_enum(ParquetLogicalType type_):
     }.get(type_, 'UNKNOWN')
 
 
+cdef encoding_name_from_enum(ParquetEncoding encoding_):
+    return {
+        ParquetEncoding_PLAIN: "PLAIN",
+        ParquetEncoding_PLAIN_DICTIONARY: "PLAIN_DICTIONARY",
+        ParquetEncoding_RLE: "RLE",
+        ParquetEncoding_BIT_PACKED: "BIT_PACKED",
+        ParquetEncoding_DELTA_BINARY_PACKED: "DELTA_BINARY_PACKED",
+        ParquetEncoding_DELTA_LENGTH_BYTE_ARRAY: "DELTA_LENGTH_BYTE_ARRAY",
+        ParquetEncoding_DELTA_BYTE_ARRAY: "DELTA_BYTE_ARRAY",
+        ParquetEncoding_RLE_DICTIONARY: "RLE_DICTIONARY",
+    }.get(encoding_, 'UNKNOWN')
+
+
 cdef class ParquetReader:
     cdef:
         object source
diff --git a/python/pyarrow/compat.py b/python/pyarrow/compat.py
index f9c148b14..866cbdd96 100644
--- a/python/pyarrow/compat.py
+++ b/python/pyarrow/compat.py
@@ -103,6 +103,9 @@ def tobytes(o):
 
     def frombytes(o):
         return o
+
+    def unichar(s):
+        return unichr(s)
 else:
     unicode_type = str
     def lzip(*x):
@@ -131,6 +134,9 @@ def tobytes(o):
     def frombytes(o):
         return o.decode('utf8')
 
+    def unichar(s):
+        return chr(s)
+
 try:
     import cloudpickle as pickle
 except ImportError:
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 95dd6a471..e2e6863c4 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -22,7 +22,7 @@
 import json
 import pytest
 
-from pyarrow.compat import guid, u, BytesIO
+from pyarrow.compat import guid, u, BytesIO, unichar, frombytes
 from pyarrow.filesystem import LocalFileSystem
 import pyarrow as pa
 from .pandas_examples import dataframe_with_arrays, dataframe_with_lists
@@ -469,14 +469,62 @@ def test_parquet_metadata_api():
         schema[-1]
 
     # Row group
-    rg_meta = meta.row_group(0)
-    repr(rg_meta)
+    for rg in range(meta.num_row_groups):
+        rg_meta = meta.row_group(rg)
+        repr(rg_meta)
+
+        for col in range(rg_meta.num_columns):
+            col_meta = rg_meta.column(col)
+            repr(col_meta)
 
     assert rg_meta.num_rows == len(df)
     assert rg_meta.num_columns == ncols + 1  # +1 for index
 
 
 @parquet
+@pytest.mark.parametrize(
+    'data, dtype, min_value, max_value, null_count, num_values',
+    [
+        ([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4),
+        ([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4),
+        ([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4),
+        ([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4),
+        ([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4),
+        ([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4),
+        ([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4),
+        ([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4),
+        (
+            [u'', u'b', unichar(1000), None, u'aaa'],
+            str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4
+        ),
+        ([True, False, False, True, True], np.bool, u'0', u'1', 0, 5),
+    ]
+)
+def test_parquet_column_statistics_api(
+        data,
+        dtype,
+        min_value,
+        max_value,
+        null_count,
+        num_values):
+    df = pd.DataFrame({'data': data}, dtype=dtype)
+
+    fileh = make_sample_file(df)
+
+    meta = fileh.metadata
+
+    rg_meta = meta.row_group(0)
+    col_meta = rg_meta.column(0)
+
+    stat = col_meta.statistics
+    assert stat.min == min_value
+    assert stat.max == max_value
+    assert stat.null_count == null_count
+    assert stat.num_values == num_values
+
+
+@parquet
 def test_compare_schemas():
     df = alltypes_sample(size=10000)
 


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] Add accessors for Parquet column statistics 
> -----------------------------------------------------
>
>                 Key: ARROW-480
>                 URL: https://issues.apache.org/jira/browse/ARROW-480
>             Project: Apache Arrow
>          Issue Type: New Feature
>          Components: Python
>            Reporter: Wes McKinney
>            Assignee: Licht Takeuchi
>              Labels: pull-request-available
>             Fix For: 0.9.0
>
>




--
This message was sent by Atlassian JIRA
(v6.4.14#64029)
