[ 
https://issues.apache.org/jira/browse/ARROW-7980?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

Søren Fuglede Jørgensen updated ARROW-7980:
-------------------------------------------
    Description: 
When following the procedure outlined here
(https://stackoverflow.com/a/57986261/5085211) to use `pyarrow` to
serialize/deserialize pandas data frames, the below example fails with the
given traceback:

 

import pandas as pd
import pyarrow as pa
df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
context = pa.default_serialization_context()
pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

 
--------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-6f75cc47c6d5> in <module>
----> 1 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize_from()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializedPyObject.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializationContext._deserialize_callback()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.py 
in _deserialize_pandas_dataframe(data)
    167 
    168     def _deserialize_pandas_dataframe(data):
--> 169         return pdcompat.serialized_dict_to_dataframe(data)
    170 
    171     def _serialize_pandas_series(obj):

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in serialized_dict_to_dataframe(data)
    661 def serialized_dict_to_dataframe(data):
    662     import pandas.core.internals as _int
--> 663     reconstructed_blocks = [_reconstruct_block(block)
    664                             for block in data['blocks']]
    665 

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in <listcomp>(.0)
    661 def serialized_dict_to_dataframe(data):
    662     import pandas.core.internals as _int
--> 663     reconstructed_blocks = [_reconstruct_block(block)
    664                             for block in data['blocks']]
    665 

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in _reconstruct_block(item, columns, extension_columns)
    707                                 klass=_int.CategoricalBlock)
    708     elif 'timezone' in item:
--> 709         dtype = make_datetimetz(item['timezone'])
    710         block = _int.make_block(block_arr, placement=placement,
    711                                 klass=_int.DatetimeTZBlock,

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in make_datetimetz(tz)
    734 def make_datetimetz(tz):
    735     tz = pa.lib.string_to_tzinfo(tz)
--> 736     return _pandas_api.datetimetz_type('ns', tz=tz)
    737 
    738 

TypeError: 'NoneType' object is not callable

 

Interestingly, if I comment out the two `pd.to_datetime` lines, the example
works (perhaps unsurprisingly); but if I then add them back and rerun, the
original reproducing example suddenly works as well. That is, this works:

import pandas as pd
import pyarrow as pa
df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
context = pa.default_serialization_context()
pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
context = pa.default_serialization_context()
pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

The issue occurs with pyarrow 0.16.0, and in both pandas 0.25.3 and 1.0.1.

  was:
When following the [procedure outlined 
here]([https://stackoverflow.com/a/57986261/5085211]) to use `pyarrow` to 
serialize/deserialize pandas data frames, the below example fails with the 
given traceback:

{{
import pandas as pd
 import pyarrow as pa
 df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
 df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

--------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-9-6f75cc47c6d5> in <module>
----> 1 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.deserialize_from()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializedPyObject.deserialize()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
 in pyarrow.lib.SerializationContext._deserialize_callback()

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.py 
in _deserialize_pandas_dataframe(data)
    167 
    168     def _deserialize_pandas_dataframe(data):
--> 169         return pdcompat.serialized_dict_to_dataframe(data)
    170 
    171     def _serialize_pandas_series(obj):

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in serialized_dict_to_dataframe(data)
    661 def serialized_dict_to_dataframe(data):
    662     import pandas.core.internals as _int
--> 663     reconstructed_blocks = [_reconstruct_block(block)
    664                             for block in data['blocks']]
    665 

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in <listcomp>(.0)
    661 def serialized_dict_to_dataframe(data):
    662     import pandas.core.internals as _int
--> 663     reconstructed_blocks = [_reconstruct_block(block)
    664                             for block in data['blocks']]
    665 

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in _reconstruct_block(item, columns, extension_columns)
    707                                 klass=_int.CategoricalBlock)
    708     elif 'timezone' in item:
--> 709         dtype = make_datetimetz(item['timezone'])
    710         block = _int.make_block(block_arr, placement=placement,
    711                                 klass=_int.DatetimeTZBlock,

~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py 
in make_datetimetz(tz)
    734 def make_datetimetz(tz):
    735     tz = pa.lib.string_to_tzinfo(tz)
--> 736     return _pandas_api.datetimetz_type('ns', tz=tz)
    737 
    738 

TypeError: 'NoneType' object is not callable

}}

Perhaps interestingly, if I comment out the two `pd.to_datetime` lines, the 
thing works (perhaps unsurprisingly), but if I then include them again, the 
original reproducing example all of a sudden works. That is, this works:

{{
 import pandas as pd 
 import pyarrow as pa 
 df = pd.DataFrame([\{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
 
 df = pd.DataFrame([\{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
'2020-02-25T22:15:00'}])
 df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
 df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
 context = pa.default_serialization_context()
 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())}}

The issue occurs with pyarrow 0.16.0, and in both pandas 0.25.3 and 1.0.1.


> Deserialization with pyarrow fails for certain Timestamp-based data frame
> -------------------------------------------------------------------------
>
>                 Key: ARROW-7980
>                 URL: https://issues.apache.org/jira/browse/ARROW-7980
>             Project: Apache Arrow
>          Issue Type: Bug
>          Components: Python
>    Affects Versions: 0.16.0
>            Reporter: Søren Fuglede Jørgensen
>            Priority: Major
>
> When following the procedure outlined here
> (https://stackoverflow.com/a/57986261/5085211) to use `pyarrow` to
> serialize/deserialize pandas data frames, the below example fails with the
> given traceback:
>  
> import pandas as pd
> import pyarrow as pa
> df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
> '2020-02-25T22:15:00'}])
> df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
> df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
> context = pa.default_serialization_context()
> pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
>  
> --------------------------------------------------------------------------
> TypeError                                 Traceback (most recent call last)
> <ipython-input-9-6f75cc47c6d5> in <module>
> ----> 1 pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.deserialize()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.deserialize_from()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.SerializedPyObject.deserialize()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.pxi
>  in pyarrow.lib.SerializationContext._deserialize_callback()
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/serialization.py
>  in _deserialize_pandas_dataframe(data)
>     167 
>     168     def _deserialize_pandas_dataframe(data):
> --> 169         return pdcompat.serialized_dict_to_dataframe(data)
>     170 
>     171     def _serialize_pandas_series(obj):
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in serialized_dict_to_dataframe(data)
>     661 def serialized_dict_to_dataframe(data):
>     662     import pandas.core.internals as _int
> --> 663     reconstructed_blocks = [_reconstruct_block(block)
>     664                             for block in data['blocks']]
>     665 
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in <listcomp>(.0)
>     661 def serialized_dict_to_dataframe(data):
>     662     import pandas.core.internals as _int
> --> 663     reconstructed_blocks = [_reconstruct_block(block)
>     664                             for block in data['blocks']]
>     665 
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in _reconstruct_block(item, columns, extension_columns)
>     707                                 klass=_int.CategoricalBlock)
>     708     elif 'timezone' in item:
> --> 709         dtype = make_datetimetz(item['timezone'])
>     710         block = _int.make_block(block_arr, placement=placement,
>     711                                 klass=_int.DatetimeTZBlock,
> ~/miniconda3/envs/emission/lib/python3.8/site-packages/pyarrow/pandas_compat.py
>  in make_datetimetz(tz)
>     734 def make_datetimetz(tz):
>     735     tz = pa.lib.string_to_tzinfo(tz)
> --> 736     return _pandas_api.datetimetz_type('ns', tz=tz)
>     737 
>     738 
> TypeError: 'NoneType' object is not callable
>  
> Interestingly, if I comment out the two `pd.to_datetime` lines, the example
> works (perhaps unsurprisingly); but if I then add them back and rerun, the
> original reproducing example suddenly works as well. That is, this works:
> import pandas as pd
> import pyarrow as pa
> df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
> '2020-02-25T22:15:00'}])
> context = pa.default_serialization_context()
> pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
> df = pd.DataFrame([{'Minutes5UTC': '2020-02-25T21:15:00+00:00', 'Minutes5DK': 
> '2020-02-25T22:15:00'}])
> df['Minutes5DK'] = pd.to_datetime(df.Minutes5DK)
> df['Minutes5UTC'] = pd.to_datetime(df.Minutes5UTC)
> context = pa.default_serialization_context()
> pa.deserialize(pa.serialize(df).to_buffer().to_pybytes())
> The issue occurs with pyarrow 0.16.0, and in both pandas 0.25.3 and 1.0.1.



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

Reply via email to