EMCP opened a new issue, #4356:
URL: https://github.com/apache/arrow-rs/issues/4356
**Describe the bug**
I am trying out arrow-rs for the first time, with the eventual goal of
migrating off the Python implementation. One of the newest files that came
across my bench started to throw an exception during this routine to dedupe
data, and I am unsure why.
Here's the routine:
```
/// Opens the file at `some_file_path` and feeds it to a `ParquetReader`,
/// returning whatever `finish()` yields.
///
/// # Panics
/// Panics if the file cannot be opened.
fn example_get_frame(some_file_path: &str) -> PolarsResult<DataFrame> {
    let parquet_file = fs::File::open(some_file_path).unwrap();
    ParquetReader::new(parquet_file).finish()
}
/// Loads the parquet file behind `entry`, calls `unique(None,
/// UniqueKeepStrategy::First)` on the resulting frame, and writes the outcome
/// to `output_dir` under the entry's own file name.
///
/// # Panics
/// Panics if the entry's path or file name is not valid UTF-8, if the frame
/// cannot be read or de-duplicated, or if the output file cannot be created
/// or written.
fn dedupe_parquet_file(entry: walkdir::DirEntry, output_dir: String) {
    println!("modifying !");

    // Read the input frame and de-duplicate it.
    let source_path = entry.path().to_str().unwrap();
    let loaded = example_get_frame(source_path).expect("");
    let mut deduped = loaded
        .unique(None, UniqueKeepStrategy::First)
        .expect("");

    //TODO: build and verify a proper path
    let file_name = entry.file_name().to_str().unwrap();
    let target_path = Path::new(output_dir.as_str()).join(file_name);
    println!("{}", target_path.to_str().unwrap());

    // Write the de-duplicated frame to the target location.
    let mut sink = fs::File::create(target_path).unwrap();
    ParquetWriter::new(&mut sink).finish(&mut deduped).unwrap();
    println!();
}
```
The error:
```
thread 'main' panicked at ': ArrowError(ExternalFormat("File out of
specification: Invalid DECIMAL: scale (1) cannot be greater than or equal to
precision (1)"))', src/main.rs:21:25
stack backtrace:
0: rust_begin_unwind
at
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/std/src/panicking.rs:579:5
1: core::panicking::panic_fmt
at
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/panicking.rs:64:14
2: core::result::unwrap_failed
at
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/result.rs:1750:5
3: core::result::Result<T,E>::expect
at
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/result.rs:1047:23
4: parquet_dedupe_data::dedupe_parquet_file
at ./src/main.rs:21:22
5: parquet_dedupe_data::main
at ./src/main.rs:53:13
6: core::ops::function::FnOnce::call_once
at
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/ops/function.rs:250:5
note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose
backtrace.
```
**To Reproduce**
As you can see, I walk the input directory, find parquet files, and attempt to
dedupe them.
**Expected behavior**
I think either there is an error in my data, or this decimal case (scale equal
to precision, e.g. precision 1 / scale 1) is not well supported by arrow-rs.
**Additional context**
Here's the schema of the offending file:
```
{
"type" : "record",
"name" : "schema",
"fields" : [ {
"name" : "category",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "maturity",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "liquid_hours",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "long_name",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "contract_month",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "real_expiration_date",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "under_sec_type",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "trading_hours",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "ev_rule",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "time_zone_id",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "next_option_partial",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "next_option_date",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "price_magnifier",
"type" : [ "null", {
"type" : "fixed",
"name" : "price_magnifier",
"size" : 2,
"logicalType" : "decimal",
"precision" : 4,
"scale" : 1
} ],
"default" : null
}, {
"name" : "agg_group",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "stock_type",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "under_symbol",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "market_rule_ids",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "query_start_time",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "last_trade_time",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "convertible",
"type" : [ "null", "boolean" ],
"default" : null
}, {
"name" : "coupon",
"type" : [ "null", {
"type" : "fixed",
"name" : "coupon",
"size" : 1,
"logicalType" : "decimal",
"precision" : 1,
"scale" : 1
} ],
"default" : null
}, {
"name" : "cusip_check_digit",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "callable",
"type" : [ "null", "boolean" ],
"default" : null
}, {
"name" : "isin",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "issue_date",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "ratings",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "putable",
"type" : [ "null", "boolean" ],
"default" : null
}, {
"name" : "min_tick",
"type" : [ "null", {
"type" : "fixed",
"name" : "min_tick",
"size" : 2,
"logicalType" : "decimal",
"precision" : 4,
"scale" : 4
} ],
"default" : null
}, {
"name" : "market_name",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "order_types",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "next_option_type",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "suggested_size_increment",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "bond_type",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "industry",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "contract_id",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "ev_multiplier",
"type" : [ "null", {
"type" : "fixed",
"name" : "ev_multiplier",
"size" : 1,
"logicalType" : "decimal",
"precision" : 1,
"scale" : 1
} ],
"default" : null
}, {
"name" : "subcategory",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "min_size",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "under_contract_id",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "cusip",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "coupon_type",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "desc_append",
"type" : [ "null", "string" ],
"default" : null
}, {
"name" : "size_increment",
"type" : [ "null", "long" ],
"default" : null
}, {
"name" : "notes",
"type" : [ "null", "string" ],
"default" : null
} ]
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]