[jira] [Commented] (ARROW-1982) [Python] Return parquet statistics min/max as values instead of strings
[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386608#comment-16386608 ] ASF GitHub Bot commented on ARROW-1982: --- wesm closed pull request #1698: ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful Python scalar types URL: https://github.com/apache/arrow/pull/1698 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/python/pyarrow/_parquet.pyx b/python/pyarrow/_parquet.pyx index e513e1d92..101fcd165 100644 --- a/python/pyarrow/_parquet.pyx +++ b/python/pyarrow/_parquet.pyx @@ -70,6 +70,31 @@ cdef class RowGroupStatistics: self.num_values, self.physical_type) +cdef inline _cast_statistic(self, object value): +# Input value is bytes +cdef ParquetType physical_type = self.statistics.get().physical_type() +if physical_type == ParquetType_BOOLEAN: +return bool(int(value)) +elif physical_type == ParquetType_INT32: +return int(value) +elif physical_type == ParquetType_INT64: +return int(value) +elif physical_type == ParquetType_INT96: +# Leave as PyBytes +return value +elif physical_type == ParquetType_FLOAT: +return float(value) +elif physical_type == ParquetType_DOUBLE: +return float(value) +elif physical_type == ParquetType_BYTE_ARRAY: +# Leave as PyBytes +return value +elif physical_type == ParquetType_FIXED_LEN_BYTE_ARRAY: +# Leave as PyBytes +return value +else: +raise ValueError('Unknown physical ParquetType') + property has_min_max: def __get__(self): @@ -82,7 +107,7 @@ cdef class RowGroupStatistics: encode_min = self.statistics.get().EncodeMin() min_value = FormatStatValue(raw_physical_type, encode_min.c_str()) -return frombytes(min_value) +return self._cast_statistic(min_value) property max: @@ -91,7 +116,7 @@ cdef class 
RowGroupStatistics: encode_max = self.statistics.get().EncodeMax() max_value = FormatStatValue(raw_physical_type, encode_max.c_str()) -return frombytes(max_value) +return self._cast_statistic(max_value) property null_count: diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index cec01c859..a3da05fe3 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -26,7 +26,7 @@ import pytest -from pyarrow.compat import guid, u, BytesIO, unichar, frombytes +from pyarrow.compat import guid, u, BytesIO, unichar from pyarrow.tests import util from pyarrow.filesystem import LocalFileSystem import pyarrow as pa @@ -524,20 +524,20 @@ def test_parquet_metadata_api(): @pytest.mark.parametrize( 'data, dtype, min_value, max_value, null_count, num_values', [ -([1, 2, 2, None, 4], np.uint8, u'1', u'4', 1, 4), -([1, 2, 2, None, 4], np.uint16, u'1', u'4', 1, 4), -([1, 2, 2, None, 4], np.uint32, u'1', u'4', 1, 4), -([1, 2, 2, None, 4], np.uint64, u'1', u'4', 1, 4), -([-1, 2, 2, None, 4], np.int16, u'-1', u'4', 1, 4), -([-1, 2, 2, None, 4], np.int32, u'-1', u'4', 1, 4), -([-1, 2, 2, None, 4], np.int64, u'-1', u'4', 1, 4), -([-1.1, 2.2, 2.3, None, 4.4], np.float32, u'-1.1', u'4.4', 1, 4), -([-1.1, 2.2, 2.3, None, 4.4], np.float64, u'-1.1', u'4.4', 1, 4), +([1, 2, 2, None, 4], np.uint8, 1, 4, 1, 4), +([1, 2, 2, None, 4], np.uint16, 1, 4, 1, 4), +([1, 2, 2, None, 4], np.uint32, 1, 4, 1, 4), +([1, 2, 2, None, 4], np.uint64, 1, 4, 1, 4), +([-1, 2, 2, None, 4], np.int16, -1, 4, 1, 4), +([-1, 2, 2, None, 4], np.int32, -1, 4, 1, 4), +([-1, 2, 2, None, 4], np.int64, -1, 4, 1, 4), +([-1.1, 2.2, 2.3, None, 4.4], np.float32, -1.1, 4.4, 1, 4), +([-1.1, 2.2, 2.3, None, 4.4], np.float64, -1.1, 4.4, 1, 4), ( [u'', u'b', unichar(1000), None, u'aaa'], -str, u' ', frombytes((unichar(1000) + u' ').encode('utf-8')), 1, 4 +str, b' ', (unichar(1000) + u' ').encode('utf-8'), 1, 4 ), -([True, False, False, True, True], np.bool, u'0', 
u'1', 0, 5), +([True, False, False, True, True], np.bool, False, True, 0, 5), ] ) def test_parquet_column_statistics_api( This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[jira] [Commented] (ARROW-1982) [Python] Return parquet statistics min/max as values instead of strings
[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386598#comment-16386598 ] ASF GitHub Bot commented on ARROW-1982: --- wesm commented on issue #1698: ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful Python scalar types URL: https://github.com/apache/arrow/pull/1698#issuecomment-370533662 Appveyor build: https://ci.appveyor.com/project/wesm/arrow/build/1.0.1737 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Return parquet statistics min/max as values instead of strings > --- > > Key: ARROW-1982 > URL: https://issues.apache.org/jira/browse/ARROW-1982 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Jim Crist >Assignee: Wes McKinney >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Currently `min` and `max` column statistics are returned as formatted strings > of the _physical type_. This makes using them in python a bit tricky, as the > strings need to be parsed as the proper _logical type_. 
Observe: > {code} > In [20]: import pandas as pd > In [21]: df = pd.DataFrame({'a': [1, 2, 3], > ...:'b': ['a', 'b', 'c'], > ...:'c': [pd.Timestamp('1991-01-01')]*3}) > ...: > In [22]: df.to_parquet('temp.parquet', engine='pyarrow') > In [23]: from pyarrow import parquet as pq > In [24]: f = pq.ParquetFile('temp.parquet') > In [25]: rg = f.metadata.row_group(0) > In [26]: rg.column(0).statistics.min # string instead of integer > Out[26]: '1' > In [27]: rg.column(1).statistics.min # weird space added after value due to > formatter > Out[27]: 'a ' > In [28]: rg.column(2).statistics.min # formatted as physical type (int) > instead of logical (datetime) > Out[28]: '66268800' > {code} > Since the type information is known, it should be possible to convert these > to arrow values instead of strings. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-1982) [Python] Return parquet statistics min/max as values instead of strings
[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386591#comment-16386591 ] ASF GitHub Bot commented on ARROW-1982: --- wesm commented on a change in pull request #1698: ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful Python scalar types URL: https://github.com/apache/arrow/pull/1698#discussion_r172299264 ## File path: python/pyarrow/_parquet.pyx ## @@ -70,6 +70,28 @@ cdef class RowGroupStatistics: self.num_values, self.physical_type) +cdef inline _cast_statistic(self, object value): +cdef ParquetType physical_type = self.statistics.get().physical_type() +if physical_type == ParquetType_BOOLEAN: +return bool(int(value)) +elif physical_type == ParquetType_INT32: +return int(value) +elif physical_type == ParquetType_INT64: +return int(value) +elif physical_type == ParquetType_INT96: +# TODO Review comment: OK, value is bytes here already so this can remain as, I'll remove the TODO This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Return parquet statistics min/max as values instead of strings > --- > > Key: ARROW-1982 > URL: https://issues.apache.org/jira/browse/ARROW-1982 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Jim Crist >Assignee: Wes McKinney >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Currently `min` and `max` column statistics are returned as formatted strings > of the _physical type_. This makes using them in python a bit tricky, as the > strings need to be parsed as the proper _logical type_. 
Observe: > {code} > In [20]: import pandas as pd > In [21]: df = pd.DataFrame({'a': [1, 2, 3], > ...:'b': ['a', 'b', 'c'], > ...:'c': [pd.Timestamp('1991-01-01')]*3}) > ...: > In [22]: df.to_parquet('temp.parquet', engine='pyarrow') > In [23]: from pyarrow import parquet as pq > In [24]: f = pq.ParquetFile('temp.parquet') > In [25]: rg = f.metadata.row_group(0) > In [26]: rg.column(0).statistics.min # string instead of integer > Out[26]: '1' > In [27]: rg.column(1).statistics.min # weird space added after value due to > formatter > Out[27]: 'a ' > In [28]: rg.column(2).statistics.min # formatted as physical type (int) > instead of logical (datetime) > Out[28]: '66268800' > {code} > Since the type information is known, it should be possible to convert these > to arrow values instead of strings. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-1982) [Python] Return parquet statistics min/max as values instead of strings
[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386518#comment-16386518 ] ASF GitHub Bot commented on ARROW-1982: --- xhochy commented on a change in pull request #1698: ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful Python scalar types URL: https://github.com/apache/arrow/pull/1698#discussion_r172287015 ## File path: python/pyarrow/_parquet.pyx ## @@ -70,6 +70,28 @@ cdef class RowGroupStatistics: self.num_values, self.physical_type) +cdef inline _cast_statistic(self, object value): +cdef ParquetType physical_type = self.statistics.get().physical_type() +if physical_type == ParquetType_BOOLEAN: +return bool(int(value)) +elif physical_type == ParquetType_INT32: +return int(value) +elif physical_type == ParquetType_INT64: +return int(value) +elif physical_type == ParquetType_INT96: +# TODO Review comment: We should return also `bytes` here. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Return parquet statistics min/max as values instead of strings > --- > > Key: ARROW-1982 > URL: https://issues.apache.org/jira/browse/ARROW-1982 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Jim Crist >Assignee: Wes McKinney >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Currently `min` and `max` column statistics are returned as formatted strings > of the _physical type_. This makes using them in python a bit tricky, as the > strings need to be parsed as the proper _logical type_. 
Observe: > {code} > In [20]: import pandas as pd > In [21]: df = pd.DataFrame({'a': [1, 2, 3], > ...:'b': ['a', 'b', 'c'], > ...:'c': [pd.Timestamp('1991-01-01')]*3}) > ...: > In [22]: df.to_parquet('temp.parquet', engine='pyarrow') > In [23]: from pyarrow import parquet as pq > In [24]: f = pq.ParquetFile('temp.parquet') > In [25]: rg = f.metadata.row_group(0) > In [26]: rg.column(0).statistics.min # string instead of integer > Out[26]: '1' > In [27]: rg.column(1).statistics.min # weird space added after value due to > formatter > Out[27]: 'a ' > In [28]: rg.column(2).statistics.min # formatted as physical type (int) > instead of logical (datetime) > Out[28]: '66268800' > {code} > Since the type information is known, it should be possible to convert these > to arrow values instead of strings. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-1982) [Python] Return parquet statistics min/max as values instead of strings
[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385404#comment-16385404 ] ASF GitHub Bot commented on ARROW-1982: --- wesm opened a new pull request #1698: ARROW-1982: [Python] Coerce Parquet statistics as bytes to more useful Python scalar types URL: https://github.com/apache/arrow/pull/1698 I also changed the BYTE_ARRAY, FIXED_LEN_BYTE_ARRAY to return bytes since decoding from binary to UTF8 unicode didn't seem correct to me as the default behavior This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [Python] Return parquet statistics min/max as values instead of strings > --- > > Key: ARROW-1982 > URL: https://issues.apache.org/jira/browse/ARROW-1982 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Jim Crist >Assignee: Wes McKinney >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > > Currently `min` and `max` column statistics are returned as formatted strings > of the _physical type_. This makes using them in python a bit tricky, as the > strings need to be parsed as the proper _logical type_. 
Observe: > {code} > In [20]: import pandas as pd > In [21]: df = pd.DataFrame({'a': [1, 2, 3], > ...:'b': ['a', 'b', 'c'], > ...:'c': [pd.Timestamp('1991-01-01')]*3}) > ...: > In [22]: df.to_parquet('temp.parquet', engine='pyarrow') > In [23]: from pyarrow import parquet as pq > In [24]: f = pq.ParquetFile('temp.parquet') > In [25]: rg = f.metadata.row_group(0) > In [26]: rg.column(0).statistics.min # string instead of integer > Out[26]: '1' > In [27]: rg.column(1).statistics.min # weird space added after value due to > formatter > Out[27]: 'a ' > In [28]: rg.column(2).statistics.min # formatted as physical type (int) > instead of logical (datetime) > Out[28]: '66268800' > {code} > Since the type information is known, it should be possible to convert these > to arrow values instead of strings. -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-1982) [Python] Return parquet statistics min/max as values instead of strings
[ https://issues.apache.org/jira/browse/ARROW-1982?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16336846#comment-16336846 ] Wes McKinney commented on ARROW-1982: - This seems easy enough to fix. Marked for 0.9.0 > [Python] Return parquet statistics min/max as values instead of strings > --- > > Key: ARROW-1982 > URL: https://issues.apache.org/jira/browse/ARROW-1982 > Project: Apache Arrow > Issue Type: Bug > Components: Python >Reporter: Jim Crist >Priority: Major > Fix For: 0.9.0 > > > Currently `min` and `max` column statistics are returned as formatted strings > of the _physical type_. This makes using them in python a bit tricky, as the > strings need to be parsed as the proper _logical type_. Observe: > {code} > In [20]: import pandas as pd > In [21]: df = pd.DataFrame({'a': [1, 2, 3], > ...:'b': ['a', 'b', 'c'], > ...:'c': [pd.Timestamp('1991-01-01')]*3}) > ...: > In [22]: df.to_parquet('temp.parquet', engine='pyarrow') > In [23]: from pyarrow import parquet as pq > In [24]: f = pq.ParquetFile('temp.parquet') > In [25]: rg = f.metadata.row_group(0) > In [26]: rg.column(0).statistics.min # string instead of integer > Out[26]: '1' > In [27]: rg.column(1).statistics.min # weird space added after value due to > formatter > Out[27]: 'a ' > In [28]: rg.column(2).statistics.min # formatted as physical type (int) > instead of logical (datetime) > Out[28]: '66268800' > {code} > Since the type information is known, it should be possible to convert these > to arrow values instead of strings. -- This message was sent by Atlassian JIRA (v7.6.3#76005)