[ 
https://issues.apache.org/jira/browse/ARROW-2122?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16423461#comment-16423461
 ] 

ASF GitHub Bot commented on ARROW-2122:
---------------------------------------

xhochy closed pull request #1707: ARROW-2122: [Python] Pyarrow fails to serialize dataframe with timestamp.
URL: https://github.com/apache/arrow/pull/1707
 
 
   

This PR was merged from a forked repository. Because GitHub hides the
original diff of a foreign (forked) pull request once it has been merged,
the diff is reproduced below for the sake of provenance:

diff --git a/python/pyarrow/pandas_compat.py b/python/pyarrow/pandas_compat.py
index 0bc47fc0d..fad943cce 100644
--- a/python/pyarrow/pandas_compat.py
+++ b/python/pyarrow/pandas_compat.py
@@ -128,7 +128,7 @@ def get_extension_dtype_info(column):
         }
         physical_dtype = str(cats.codes.dtype)
     elif hasattr(dtype, 'tz'):
-        metadata = {'timezone': str(dtype.tz)}
+        metadata = {'timezone': pa.lib.tzinfo_to_string(dtype.tz)}
         physical_dtype = 'datetime64[ns]'
     else:
         metadata = None
@@ -418,7 +418,7 @@ def dataframe_to_serialized_dict(frame):
         block_data = {}
 
         if isinstance(block, _int.DatetimeTZBlock):
-            block_data['timezone'] = values.tz.zone
+            block_data['timezone'] = pa.lib.tzinfo_to_string(values.tz)
             values = values.values
         elif isinstance(block, _int.CategoricalBlock):
             block_data.update(dictionary=values.categories,
@@ -482,6 +482,7 @@ def _reconstruct_block(item):
 
 def _make_datetimetz(tz):
     from pyarrow.compat import DatetimeTZDtype
+    tz = pa.lib.string_to_tzinfo(tz)
     return DatetimeTZDtype('ns', tz=tz)
 
 
diff --git a/python/pyarrow/scalar.pxi b/python/pyarrow/scalar.pxi
index a801acd69..64d44a967 100644
--- a/python/pyarrow/scalar.pxi
+++ b/python/pyarrow/scalar.pxi
@@ -235,8 +235,7 @@ cdef class TimestampValue(ArrayValue):
         value = self.value
 
         if not dtype.timezone().empty():
-            import pytz
-            tzinfo = pytz.timezone(frombytes(dtype.timezone()))
+            tzinfo = string_to_tzinfo(frombytes(dtype.timezone()))
         else:
             tzinfo = None
 
diff --git a/python/pyarrow/table.pxi b/python/pyarrow/table.pxi
index c27c0edd9..de7ddefc5 100644
--- a/python/pyarrow/table.pxi
+++ b/python/pyarrow/table.pxi
@@ -300,9 +300,11 @@ cdef class Column:
         result = pd.Series(values, name=self.name)
 
         if isinstance(self.type, TimestampType):
-            if self.type.tz is not None:
+            tz = self.type.tz
+            if tz is not None:
+                tz = string_to_tzinfo(tz)
                 result = (result.dt.tz_localize('utc')
-                          .dt.tz_convert(self.type.tz))
+                          .dt.tz_convert(tz))
 
         return result
 
diff --git a/python/pyarrow/tests/test_convert_builtin.py b/python/pyarrow/tests/test_convert_builtin.py
index 19b59a49b..988d512a8 100644
--- a/python/pyarrow/tests/test_convert_builtin.py
+++ b/python/pyarrow/tests/test_convert_builtin.py
@@ -26,6 +26,7 @@
 import itertools
 import numpy as np
 import six
+import pytz
 
 
 int_type_pairs = [
@@ -649,3 +650,14 @@ def test_decimal_array_with_none_and_nan():
 
     array = pa.array(values, type=pa.decimal128(10, 4))
     assert array.to_pylist() == [decimal.Decimal('1.2340'), None, None, None]
+
+
+@pytest.mark.parametrize('tz,name', [
+    (pytz.FixedOffset(90), '+01:30'),
+    (pytz.FixedOffset(-90), '-01:30'),
+    (pytz.utc, 'UTC'),
+    (pytz.timezone('America/New_York'), 'America/New_York')
+])
+def test_timezone_string(tz, name):
+    assert pa.lib.tzinfo_to_string(tz) == name
+    assert pa.lib.string_to_tzinfo(name) == tz
diff --git a/python/pyarrow/tests/test_convert_pandas.py b/python/pyarrow/tests/test_convert_pandas.py
index 5abc026bf..f90d805be 100644
--- a/python/pyarrow/tests/test_convert_pandas.py
+++ b/python/pyarrow/tests/test_convert_pandas.py
@@ -1000,6 +1000,16 @@ def test_array_from_pandas_date_with_mask(self):
         expected = pd.Series([None, date(1991, 1, 1), None])
         assert pa.Array.from_pandas(expected).equals(result)
 
+    def test_fixed_offset_timezone(self):
+        df = pd.DataFrame({
+            'a': [
+                pd.Timestamp('2012-11-11 00:00:00+01:00'),
+                pd.NaT
+                ]
+             })
+        _check_pandas_roundtrip(df)
+        _check_serialize_components_roundtrip(df)
+
 
 class TestConvertStringLikeTypes(object):
     """
diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi
index 5f962901c..1ff42d70f 100644
--- a/python/pyarrow/types.pxi
+++ b/python/pyarrow/types.pxi
@@ -15,6 +15,8 @@
 # specific language governing permissions and limitations
 # under the License.
 
+import re
+
 # These are imprecise because the type (in pandas 0.x) depends on the presence
 # of nulls
 cdef dict _pandas_type_map = {
@@ -847,6 +849,63 @@ cdef timeunit_to_string(TimeUnit unit):
         return 'ns'
 
 
+_FIXED_OFFSET_RE = re.compile(r'([+-])(0[0-9]|1[0-9]|2[0-3]):([0-5][0-9])$')
+
+
+def tzinfo_to_string(tz):
+    """
+    Converts a time zone object into a string indicating the name of a time
+    zone, one of:
+    * As used in the Olson time zone database (the "tz database" or
+      "tzdata"), such as "America/New_York"
+    * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+
+    Parameters
+    ----------
+      tz : datetime.tzinfo
+        Time zone object
+
+    Returns
+    -------
+      name : string
+        Time zone name
+    """
+    if tz.zone is None:
+        sign = '+' if tz._minutes >= 0 else '-'
+        hours, minutes = divmod(abs(tz._minutes), 60)
+        return '{}{:02d}:{:02d}'.format(sign, hours, minutes)
+    else:
+        return tz.zone
+
+
+def string_to_tzinfo(name):
+    """
+    Converts a string indicating the name of a time zone into a time zone
+    object, one of:
+    * As used in the Olson time zone database (the "tz database" or
+      "tzdata"), such as "America/New_York"
+    * An absolute time zone offset of the form +XX:XX or -XX:XX, such as +07:30
+
+    Parameters
+    ----------
+      name: string
+        Time zone name
+
+    Returns
+    -------
+      tz : datetime.tzinfo
+        Time zone object
+    """
+    import pytz
+    m = _FIXED_OFFSET_RE.match(name)
+    if m:
+        sign = 1 if m.group(1) == '+' else -1
+        hours, minutes = map(int, m.group(2, 3))
+        return pytz.FixedOffset(sign * (hours * 60 + minutes))
+    else:
+        return pytz.timezone(name)
+
+
 def timestamp(unit, tz=None):
     """
     Create instance of timestamp type with resolution and optional time zone
@@ -894,7 +953,7 @@ def timestamp(unit, tz=None):
         _timestamp_type_cache[unit_code] = out
     else:
         if not isinstance(tz, six.string_types):
-            tz = tz.zone
+            tz = tzinfo_to_string(tz)
 
         c_timezone = tobytes(tz)
         out.init(ctimestamp(unit_code, c_timezone))


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> [Python] Pyarrow fails to serialize dataframe with timestamp.
> -------------------------------------------------------------
>
>                 Key: ARROW-2122
>                 URL: https://issues.apache.org/jira/browse/ARROW-2122
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>            Reporter: Robert Nishihara
>            Assignee: Albert Shieh
>            Priority: Major
>              Labels: pull-request-available
>             Fix For: 0.10.0
>
>
> The bug can be reproduced as follows.
> {code:java}
> import pyarrow as pa
> import pandas as pd
> df = pd.DataFrame({'A': [pd.Timestamp('2012-11-11 00:00:00+01:00'), pd.NaT]}) 
> s = pa.serialize(df).to_buffer()
> new_df = pa.deserialize(s) # this fails{code}
> The last line fails with
> {code:java}
> Traceback (most recent call last):
>   File "<stdin>", line 1, in <module>
>   File "serialization.pxi", line 441, in pyarrow.lib.deserialize
>   File "serialization.pxi", line 404, in pyarrow.lib.deserialize_from
>   File "serialization.pxi", line 257, in pyarrow.lib.SerializedPyObject.deserialize
>   File "serialization.pxi", line 174, in pyarrow.lib.SerializationContext._deserialize_callback
>   File "/home/ubuntu/arrow/python/pyarrow/serialization.py", line 77, in _deserialize_pandas_dataframe
>     return pdcompat.serialized_dict_to_dataframe(data)
>   File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in serialized_dict_to_dataframe
>     for block in data['blocks']]
>   File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 446, in <listcomp>
>     for block in data['blocks']]
>   File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 466, in _reconstruct_block
>     dtype = _make_datetimetz(item['timezone'])
>   File "/home/ubuntu/arrow/python/pyarrow/pandas_compat.py", line 481, in _make_datetimetz
>     return DatetimeTZDtype('ns', tz=tz)
>   File "/home/ubuntu/anaconda3/lib/python3.5/site-packages/pandas/core/dtypes/dtypes.py", line 409, in __new__
>     raise ValueError("DatetimeTZDtype constructor must have a tz "
> ValueError: DatetimeTZDtype constructor must have a tz supplied{code}
>  



--
This message was sent by Atlassian JIRA
(v7.6.3#76005)

Reply via email to