wjones127 commented on issue #12416:
URL: https://github.com/apache/arrow/issues/12416#issuecomment-1043536284
The good news is that this should be an easy fix: the newer datasets implementation
doesn't have this issue, so you can pass in `use_legacy_dataset=False` (default
is `True` for now). See the example below.
@jorisvandenbossche Is the legacy implementation being deprecated and removed
later? Or should we consider looking into fixing this?
```python
import pandas as pd
import polars as pl
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.dataset as ds
import tempfile
def read_csv(csv_file):
    """Build the sample Arrow table from an inline CSV payload.

    The ``csv_file`` argument is accepted but not used; the CSV content is
    embedded below. Two polars frames are built from the same bytes:

    * ``good_df`` reads ``intCol`` as ``pl.Utf8``, applies
      ``str.replace("", "0")`` and casts the result to ``pl.Int64``.
    * ``bad_df`` reads the CSV without any dtype overrides.

    The frame that is converted and returned is ``bad_df``.

    Returns:
        The pyarrow table produced by ``df.to_arrow()``.
    """
    print("reading csv")

    # The literal's continuation lines must stay at column 0 so the CSV
    # bytes are unchanged.
    csv_text = """pid1,pid2,intCol,strCol
2010,10000,23455555508999,Peaceful
2015,15000,7753285016841556620,Happy
2020,25000,,World"""
    csv = csv_text.encode()

    # Variant with intCol read as text and then cast to Int64.
    text_typed = pl.read_csv(csv, dtypes={"intCol": pl.Utf8})
    int_col_expr = pl.col("intCol").str.replace("", "0").cast(pl.Int64)
    good_df = text_typed.with_column(int_col_expr)

    # Variant read with no dtype overrides.
    bad_df = pl.read_csv(csv)

    # Toggle: assign good_df here instead to run the other variant.
    df = bad_df

    print(df.head(10))

    table = df.to_arrow()
    print('Table Schema..\n',table.schema)
    print(table['intCol'])
    return table
def save_table(table, location):
    """Write ``table`` under ``location`` as a dataset partitioned by pid1/pid2.

    The write goes through ``pq.write_to_dataset`` with
    ``use_legacy_dataset=False``.
    """
    partition_columns = ['pid1','pid2']
    pq.write_to_dataset(
        table,
        location,
        partition_cols=partition_columns,
        use_legacy_dataset=False,
    )
    # For reference, the old deprecated datasets API call was:
    # pq.write_to_dataset(table, location, partition_cols=['pid1','pid2'])
def read_table(location):
    """Read the dataset at ``location`` back and print what was loaded.

    The dataset is opened with hive-flavoured partitioning whose schema
    declares ``pid1`` and ``pid2`` as int64. The function prints the
    ``intCol`` column, the Arrow table, and the first rows of the polars
    frame built from it. It has no return statement.
    """
    partition_schema = pa.schema([('pid1', pa.int64()), ('pid2', pa.int64())])
    dataset = ds.dataset(
        location,
        partitioning=ds.partitioning(schema=partition_schema, flavor='hive'),
    )

    loaded = dataset.to_table()
    print(loaded['intCol'])
    print("Retrived table schema\n", loaded)

    frame = pl.from_arrow(loaded)
    print(frame.head(10))
# Driver: round-trip the sample table through a partitioned dataset that is
# written into a freshly created temporary directory.
ds_dir = tempfile.mkdtemp()
# read_csv does not use its argument, so None is passed.
table = read_csv(None)
save_table(table, ds_dir)
read_table(ds_dir)
```
```
reading csv
shape: (3, 4)
┌──────┬───────┬─────────────────────┬──────────┐
│ pid1 ┆ pid2 ┆ intCol ┆ strCol │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ i64 ┆ i64 ┆ str │
╞══════╪═══════╪═════════════════════╪══════════╡
│ 2010 ┆ 10000 ┆ 23455555508999 ┆ Peaceful │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2015 ┆ 15000 ┆ 7753285016841556620 ┆ Happy │
├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 2020 ┆ 25000 ┆ null ┆ World │
└──────┴───────┴─────────────────────┴──────────┘
Table Schema..
pid1: int64
pid2: int64
intCol: int64
strCol: large_string
[
[
23455555508999,
7753285016841556620,
null
]
]
[
[
23455555508999
],
[
7753285016841556620
],
[
null
]
]
Retrived table schema
pyarrow.Table
intCol: int64
strCol: large_string
pid1: int64
pid2: int64
shape: (3, 4)
┌─────────────────────┬──────────┬──────┬───────┐
│ intCol ┆ strCol ┆ pid1 ┆ pid2 │
│ --- ┆ --- ┆ --- ┆ --- │
│ i64 ┆ str ┆ i64 ┆ i64 │
╞═════════════════════╪══════════╪══════╪═══════╡
│ 23455555508999 ┆ Peaceful ┆ 2010 ┆ 10000 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ 7753285016841556620 ┆ Happy ┆ 2015 ┆ 15000 │
├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
│ null ┆ World ┆ 2020 ┆ 25000 │
└─────────────────────┴──────────┴──────┴───────┘
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]