[arrow] branch master updated: ARROW-2692: [Python] Add test for writing dictionary encoded columns to chunked Parquet files

wesm Mon, 11 Jun 2018 20:58:29 -0700

This is an automated email from the ASF dual-hosted git repository.

wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git



The following commit(s) were added to refs/heads/master by this push:
     new 9a76caa  ARROW-2692: [Python] Add test for writing dictionary encoded columns to chunked Parquet files
9a76caa is described below

commit 9a76caa1ce180920eed6f3ea6c9a74b93c17485e
Author: Korn, Uwe <[email protected]>
AuthorDate: Mon Jun 11 23:58:00 2018 -0400

    ARROW-2692: [Python] Add test for writing dictionary encoded columns to chunked Parquet files
    
    Author: Korn, Uwe <[email protected]>
    
    Closes #2127 from xhochy/ARROW-2692 and squashes the following commits:
    
    b48855a8 <Korn, Uwe> ARROW-2692:  Add test for writing dictionary encoded columns to chunked Parquet files
---
 python/pyarrow/tests/test_parquet.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 730a02a..eb405af 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -107,7 +107,7 @@ def test_single_pylist_column_roundtrip(tmpdir):
             assert data_written.equals(data_read)
 
 
-def alltypes_sample(size=10000, seed=0):
+def alltypes_sample(size=10000, seed=0, categorical=False):
     np.random.seed(seed)
     arrays = {
         'uint8': np.arange(size, dtype=np.uint8),
@@ -125,33 +125,36 @@ def alltypes_sample(size=10000, seed=0):
         # them
         'datetime': np.arange("2016-01-01T00:00:00.001", size,
                               dtype='datetime64[ms]'),
-        'str': [str(x) for x in range(size)],
+        'str': pd.Series([str(x) for x in range(size)]),
         'empty_str': [''] * size,
         'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
         'null': [None] * size,
         'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
     }
+    if categorical:
+        arrays['str_category'] = arrays['str'].astype('category')
     return pd.DataFrame(arrays)
 
 
 @parquet
-def test_pandas_parquet_2_0_rountrip(tmpdir):
+@pytest.mark.parametrize('chunk_size', [None, 1000])
+def test_pandas_parquet_2_0_rountrip(tmpdir, chunk_size):
     import pyarrow.parquet as pq
-    df = alltypes_sample(size=10000)
+    df = alltypes_sample(size=10000, categorical=True)
 
     filename = tmpdir.join('pandas_rountrip.parquet')
     arrow_table = pa.Table.from_pandas(df)
     assert b'pandas' in arrow_table.schema.metadata
 
     _write_table(arrow_table, filename.strpath, version="2.0",
-                 coerce_timestamps='ms')
+                 coerce_timestamps='ms', chunk_size=chunk_size)
     table_read = pq.read_pandas(filename.strpath)
     assert b'pandas' in table_read.schema.metadata
 
     assert arrow_table.schema.metadata == table_read.schema.metadata
 
-    df_read = table_read.to_pandas()
-    tm.assert_frame_equal(df, df_read)
+    df_read = table_read.to_pandas(categories=['str_category'])
+    tm.assert_frame_equal(df, df_read, check_categorical=False)
 
 
 @parquet

-- 
To stop receiving notification emails like this one, please contact
[email protected].

[arrow] branch master updated: ARROW-2692: [Python] Add test for writing dictionary encoded columns to chunked Parquet files

Reply via email to