wjones127 commented on issue #12416:
URL: https://github.com/apache/arrow/issues/12416#issuecomment-1043536284


   The good news is that this should be an easy fix: the newer datasets 
implementation doesn't have this issue, so you can pass in 
`use_legacy_dataset=False` (the default is `True` for now). See the example below.
   
   @jorisvandenbossche Is the legacy implementation being deprecated and 
removed later? Or should we consider looking into fixing this?
   
   ```python
   import pandas as pd
   import polars as pl
   import pyarrow as pa
   import pyarrow.parquet as pq
   import pyarrow.dataset as ds
   import tempfile
   
   def read_csv(csv_file):
       """Build the reproduction table from an inline CSV payload.

       ``csv_file`` is accepted but never read; the data is embedded below.
       Prints the polars frame, the Arrow schema and the ``intCol`` column,
       then returns the Arrow table.
       """
       print("reading csv")
       raw_bytes = """pid1,pid2,intCol,strCol
       2010,10000,23455555508999,Peaceful
       2015,15000,7753285016841556620,Happy
       2020,25000,,World""".encode()
       # Variant that reads intCol as Utf8, then applies
       # str.replace("", "0") and casts the result to Int64.
       text_frame = pl.read_csv(raw_bytes, dtypes={"intCol": pl.Utf8})
       int_col_expr = pl.col("intCol").str.replace("", "0").cast(pl.Int64)
       good_df = text_frame.with_column(int_col_expr)
       # Variant that relies on polars' own dtype inference.
       bad_df = pl.read_csv(raw_bytes)
       # Point ``frame`` at ``good_df`` instead to try the other variant.
       frame = bad_df
       print(frame.head(10))
       arrow_table = frame.to_arrow()
       print('Table Schema..\n', arrow_table.schema)
       print(arrow_table['intCol'])
       return arrow_table
   
   def save_table(table, location):
       """Write ``table`` under ``location``, partitioned by pid1 and pid2."""
       # use_legacy_dataset=False selects the newer datasets implementation;
       # omitting the argument would use the legacy one (default is True).
       pq.write_to_dataset(
           table,
           location,
           partition_cols=['pid1', 'pid2'],
           use_legacy_dataset=False,
       )
   
   def read_table(location):
       """Read the hive-partitioned dataset at ``location`` and print it.

       Prints the ``intCol`` column, the Arrow table, and the first rows of
       the table converted to a polars frame. Returns nothing.
       """
       # Declare both partition keys as int64 for the hive-style layout.
       partition_schema = pa.schema([('pid1', pa.int64()), ('pid2', pa.int64())])
       hive_partitioning = ds.partitioning(schema=partition_schema, flavor='hive')
       loaded = ds.dataset(location, partitioning=hive_partitioning).to_table()
       print(loaded['intCol'])
       print("Retrived table schema\n", loaded)
       print(pl.from_arrow(loaded).head(10))
   
   # Fresh temporary directory that receives the partitioned dataset.
   ds_dir = tempfile.mkdtemp()
   
   # Round trip: build the table, write it partitioned, then read it back.
   table = read_csv(None)
   save_table(table, ds_dir)
   read_table(ds_dir)
   ``` 
   
   ```
   reading csv
   shape: (3, 4)
   ┌──────┬───────┬─────────────────────┬──────────┐
   │ pid1 ┆ pid2  ┆ intCol              ┆ strCol   │
   │ ---  ┆ ---   ┆ ---                 ┆ ---      │
   │ i64  ┆ i64   ┆ i64                 ┆ str      │
   ╞══════╪═══════╪═════════════════════╪══════════╡
   │ 2010 ┆ 10000 ┆ 23455555508999      ┆ Peaceful │
   ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
   │ 2015 ┆ 15000 ┆ 7753285016841556620 ┆ Happy    │
   ├╌╌╌╌╌╌┼╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
   │ 2020 ┆ 25000 ┆ null                ┆ World    │
   └──────┴───────┴─────────────────────┴──────────┘
   Table Schema..
    pid1: int64
   pid2: int64
   intCol: int64
   strCol: large_string
   [
     [
       23455555508999,
       7753285016841556620,
       null
     ]
   ]
   [
     [
       23455555508999
     ],
     [
       7753285016841556620
     ],
     [
       null
     ]
   ]
   Retrived table schema
    pyarrow.Table
   intCol: int64
   strCol: large_string
   pid1: int64
   pid2: int64
   shape: (3, 4)
   ┌─────────────────────┬──────────┬──────┬───────┐
   │ intCol              ┆ strCol   ┆ pid1 ┆ pid2  │
   │ ---                 ┆ ---      ┆ ---  ┆ ---   │
   │ i64                 ┆ str      ┆ i64  ┆ i64   │
   ╞═════════════════════╪══════════╪══════╪═══════╡
   │ 23455555508999      ┆ Peaceful ┆ 2010 ┆ 10000 │
   ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
   │ 7753285016841556620 ┆ Happy    ┆ 2015 ┆ 15000 │
   ├╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌┼╌╌╌╌╌╌╌┤
   │ null                ┆ World    ┆ 2020 ┆ 25000 │
   └─────────────────────┴──────────┴──────┴───────┘
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


Reply via email to