This is an automated email from the ASF dual-hosted git repository.
wesm pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/master by this push:
new 9a76caa ARROW-2692: [Python] Add test for writing dictionary encoded
columns to chunked Parquet files
9a76caa is described below
commit 9a76caa1ce180920eed6f3ea6c9a74b93c17485e
Author: Korn, Uwe <[email protected]>
AuthorDate: Mon Jun 11 23:58:00 2018 -0400
ARROW-2692: [Python] Add test for writing dictionary encoded columns to
chunked Parquet files
Author: Korn, Uwe <[email protected]>
Closes #2127 from xhochy/ARROW-2692 and squashes the following commits:
b48855a8 <Korn, Uwe> ARROW-2692: Add test for writing dictionary encoded
columns to chunked Parquet files
---
python/pyarrow/tests/test_parquet.py | 17 ++++++++++-------
1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py
index 730a02a..eb405af 100644
--- a/python/pyarrow/tests/test_parquet.py
+++ b/python/pyarrow/tests/test_parquet.py
@@ -107,7 +107,7 @@ def test_single_pylist_column_roundtrip(tmpdir):
assert data_written.equals(data_read)
-def alltypes_sample(size=10000, seed=0):
+def alltypes_sample(size=10000, seed=0, categorical=False):
np.random.seed(seed)
arrays = {
'uint8': np.arange(size, dtype=np.uint8),
@@ -125,33 +125,36 @@ def alltypes_sample(size=10000, seed=0):
# them
'datetime': np.arange("2016-01-01T00:00:00.001", size,
dtype='datetime64[ms]'),
- 'str': [str(x) for x in range(size)],
+ 'str': pd.Series([str(x) for x in range(size)]),
'empty_str': [''] * size,
'str_with_nulls': [None] + [str(x) for x in range(size - 2)] + [None],
'null': [None] * size,
'null_list': [None] * 2 + [[None] * (x % 4) for x in range(size - 2)],
}
+ if categorical:
+ arrays['str_category'] = arrays['str'].astype('category')
return pd.DataFrame(arrays)
@parquet
-def test_pandas_parquet_2_0_rountrip(tmpdir):
[email protected]('chunk_size', [None, 1000])
+def test_pandas_parquet_2_0_rountrip(tmpdir, chunk_size):
import pyarrow.parquet as pq
- df = alltypes_sample(size=10000)
+ df = alltypes_sample(size=10000, categorical=True)
filename = tmpdir.join('pandas_rountrip.parquet')
arrow_table = pa.Table.from_pandas(df)
assert b'pandas' in arrow_table.schema.metadata
_write_table(arrow_table, filename.strpath, version="2.0",
- coerce_timestamps='ms')
+ coerce_timestamps='ms', chunk_size=chunk_size)
table_read = pq.read_pandas(filename.strpath)
assert b'pandas' in table_read.schema.metadata
assert arrow_table.schema.metadata == table_read.schema.metadata
- df_read = table_read.to_pandas()
- tm.assert_frame_equal(df, df_read)
+ df_read = table_read.to_pandas(categories=['str_category'])
+ tm.assert_frame_equal(df, df_read, check_categorical=False)
@parquet
--
To stop receiving notification emails like this one, please contact
[email protected].