xiarixiaoyao edited a comment on issue #4609: URL: https://github.com/apache/hudi/issues/4609#issuecomment-1014176723
@ravs11 Sorry, I cannot reproduce the exception. Env: Spark 3.1.1, Hadoop 3.1.1, Parquet 1.12.2. Could you please help me check my code: val path1 = new Path("/tmp/default/clustering/") // create parquet file spark.sql( s""" |create table table1 ( |product_id INT, |product_name STRING, |product_category STRING, |create_time BIGINT, |utc_date STRING |) using parquet | location '${new Path(path1, "p1").toString}' | partitioned by (utc_date) """.stripMargin) spark.sql( s""" | insert into table1 values | (123, 'laptop1' , 'electronics1', 1671881778000, '2021-12-24'), | (124, 'laptop2' , 'electronics2', 1671881778000, '2021-12-25'), | (125, 'laptop3' , 'electronics3', 1671881778000, '2021-12-24'), | (126, 'laptop4' , 'electronics4', 1671881778000, '2021-12-25'), | (127, 'laptop5' , 'electronics5', 1671881778000, '2021-12-24') |""".stripMargin) spark.sql( s""" | insert into table1 values | (451, 'tshirt1' , 'mens wear1', 1671968178000, '2021-12-24'), | (452, 'tshirt2' , 'mens wear2', 1671968178000, '2021-12-25'), | (453, 'tshirt3' , 'mens wear3', 1671968178000, '2021-12-24'), | (454, 'tshirt4' , 'mens wear4', 1671968178000, '2021-12-25'), | (455, 'tshirt5' , 'mens wear5', 1671968178000, '2021-12-24') |""".stripMargin) spark.sql( s""" | insert into table1 values | (551, 'ts1' , 'wear1', 1671968178000, '2021-12-24'), | (552, 'ts2' , 'wear2', 1671968178000, '2021-12-25'), | (553, 'ts3' , 'wear3', 1671968178000, '2021-12-24'), | (554, 'ts4' , 'wear4', 1671968178000, '2021-12-25'), | (555, 'ts5' , 'wear5', 1671968178000, '2021-12-24') |""".stripMargin) // bulk_insert with z-order Seq("2021-12-24", "2021-12-25").foreach { utcDate => val dfx = spark.read.parquet(s"/tmp/default/clustering/p1/utc_date=$utcDate") .withColumn("utc_date", lit(utcDate)) val savePath = s"/tmp/default/clustering/hudi_z_order" dfx.write.format("org.apache.hudi") .option("hoodie.table.name", s"hudi_z_order_test") .option("hoodie.datasource.write.table.name", s"hudi_z_order_test") 
.option("hoodie.datasource.write.operation", "bulk_insert") .option("hoodie.sql.insert.mode", "non-strict") .option("hoodie.datasource.write.precombine.field", "create_time") .option("hoodie.datasource.write.recordkey.field", "product_id") .option("hoodie.datasource.write.partitionpath.field", "utc_date") .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator") .option("hoodie.datasource.write.hive_style_partitioning", "true") .option("hoodie.bulkinsert.shuffle.parallelism", "3") .option("hoodie.bulkinsert.sort.mode", "NONE") .option("hoodie.embed.timeline.server", "false") .option("hoodie.parquet.compression.codec", "zstd") .option("hoodie.clustering.inline", "true") .option("hoodie.clustering.inline.max.commits", "1") .option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824") .option("hoodie.clustering.plan.strategy.small.file.limit", "536870912") .option("hoodie.clustering.plan.strategy.sort.columns", "product_name,product_category") .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString) .option("hoodie.layout.optimize.enable", "true") .option("hoodie.layout.optimize.strategy", "z-order") .mode(SaveMode.Append).save(savePath) } -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org