EMCP opened a new issue, #4356:
URL: https://github.com/apache/arrow-rs/issues/4356

   **Describe the bug**
   
   I am trying out arrow-rs for the first time, with the eventual 
goal of migrating off the Python implementation. One of the newest files that 
came across my bench started to throw an exception during this routine, which 
dedupes data, and I am unsure why.
   
   Here's the routine:
   
   ```
   
   /// Opens the Parquet file at `some_file_path` and reads it into a `DataFrame`.
   ///
   /// Panics if the file cannot be opened; any error produced while reading
   /// is returned to the caller inside the `PolarsResult`.
   fn example_get_frame(some_file_path: &str) -> PolarsResult<DataFrame> {
       let parquet_file = fs::File::open(some_file_path).unwrap();
       ParquetReader::new(parquet_file).finish()
   }
   
   /// Loads the Parquet file behind `entry`, deduplicates it via
   /// `unique(None, UniqueKeepStrategy::First)`, and writes the result to
   /// `output_dir` under the same file name.
   ///
   /// Every failure panics: a non-UTF-8 path, a failed read or dedupe, or an
   /// output file that cannot be created or written.
   fn dedupe_parquet_file(entry: walkdir::DirEntry, output_dir: String) {
       println!("modifying !");

       let loaded = example_get_frame(entry.path().to_str().unwrap());
       let frame = loaded.expect("");
       let mut deduped = frame.unique(None, UniqueKeepStrategy::First).expect("");

       //TODO: build and verify a proper path
       let file_name = entry.file_name().to_str().unwrap();
       let target = Path::new(output_dir.as_str()).join(file_name);
       println!("{}", target.to_str().unwrap());

       let mut sink = fs::File::create(target).unwrap();
       ParquetWriter::new(&mut sink).finish(&mut deduped).unwrap();

       println!();
   }
   ```
   The Error
   
   ```
   thread 'main' panicked at ': ArrowError(ExternalFormat("File out of 
specification: Invalid DECIMAL: scale (1) cannot be greater than or equal to 
precision (1)"))', src/main.rs:21:25
   stack backtrace:
      0: rust_begin_unwind
                at 
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/std/src/panicking.rs:579:5
      1: core::panicking::panic_fmt
                at 
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/panicking.rs:64:14
      2: core::result::unwrap_failed
                at 
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/result.rs:1750:5
      3: core::result::Result<T,E>::expect
                at 
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/result.rs:1047:23
      4: parquet_dedupe_data::dedupe_parquet_file
                at ./src/main.rs:21:22
      5: parquet_dedupe_data::main
                at ./src/main.rs:53:13
      6: core::ops::function::FnOnce::call_once
                at 
/rustc/84c898d65adf2f39a5a98507f1fe0ce10a2b8dbc/library/core/src/ops/function.rs:250:5
   note: Some details are omitted, run with `RUST_BACKTRACE=full` for a verbose 
backtrace.
   ```
   
   **To Reproduce**
   As you can see, I walk the input directory, find Parquet files, and attempt 
to dedupe them.
   
   **Expected behavior**
   
   I think either there is an error in my data, or this decimal case (scale 
equal to precision) is not well supported by arrow-rs.
   
   **Additional context**
   
   Here's the schema of the offending file
   
   ```
   {
     "type" : "record",
     "name" : "schema",
     "fields" : [ {
       "name" : "category",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "maturity",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "liquid_hours",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "long_name",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "contract_month",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "real_expiration_date",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "under_sec_type",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "trading_hours",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "ev_rule",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "time_zone_id",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "next_option_partial",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "next_option_date",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "price_magnifier",
       "type" : [ "null", {
         "type" : "fixed",
         "name" : "price_magnifier",
         "size" : 2,
         "logicalType" : "decimal",
         "precision" : 4,
         "scale" : 1
       } ],
       "default" : null
     }, {
       "name" : "agg_group",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "stock_type",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "under_symbol",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "market_rule_ids",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "query_start_time",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "last_trade_time",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "convertible",
       "type" : [ "null", "boolean" ],
       "default" : null
     }, {
       "name" : "coupon",
       "type" : [ "null", {
         "type" : "fixed",
         "name" : "coupon",
         "size" : 1,
         "logicalType" : "decimal",
         "precision" : 1,
         "scale" : 1
       } ],
       "default" : null
     }, {
       "name" : "cusip_check_digit",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "callable",
       "type" : [ "null", "boolean" ],
       "default" : null
     }, {
       "name" : "isin",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "issue_date",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "ratings",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "putable",
       "type" : [ "null", "boolean" ],
       "default" : null
     }, {
       "name" : "min_tick",
       "type" : [ "null", {
         "type" : "fixed",
         "name" : "min_tick",
         "size" : 2,
         "logicalType" : "decimal",
         "precision" : 4,
         "scale" : 4
       } ],
       "default" : null
     }, {
       "name" : "market_name",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "order_types",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "next_option_type",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "suggested_size_increment",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "bond_type",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "industry",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "contract_id",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "ev_multiplier",
       "type" : [ "null", {
         "type" : "fixed",
         "name" : "ev_multiplier",
         "size" : 1,
         "logicalType" : "decimal",
         "precision" : 1,
         "scale" : 1
       } ],
       "default" : null
     }, {
       "name" : "subcategory",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "min_size",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "under_contract_id",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "cusip",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "coupon_type",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "desc_append",
       "type" : [ "null", "string" ],
       "default" : null
     }, {
       "name" : "size_increment",
       "type" : [ "null", "long" ],
       "default" : null
     }, {
       "name" : "notes",
       "type" : [ "null", "string" ],
       "default" : null
     } ]
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to