xiarixiaoyao commented on issue #4609:
URL: https://github.com/apache/hudi/issues/4609#issuecomment-1014176723


   @ravs11  Sorry, I cannot reproduce the exception. Could you please check my code below?
       import org.apache.hadoop.fs.Path
       import org.apache.spark.sql.SaveMode
       import org.apache.spark.sql.functions.lit

       val path1 = new Path("/tmp/default/clustering/")

       // create a partitioned parquet table as the data source
       spark.sql(
         s"""
            |create table table1 (
            |product_id INT,
            |product_name STRING,
            |product_category STRING,
            |create_time BIGINT,
            |utc_date STRING
             |) using parquet
             | partitioned by (utc_date)
             | location '${new Path(path1, "p1").toString}'
             """.stripMargin)
       spark.sql(
         s"""
            | insert into table1 values
            | (123, 'laptop1' , 'electronics1', 1671881778000, '2021-12-24'),
            | (124, 'laptop2' , 'electronics2', 1671881778000, '2021-12-25'),
            | (125, 'laptop3' , 'electronics3', 1671881778000, '2021-12-24'),
            | (126, 'laptop4' , 'electronics4', 1671881778000, '2021-12-25'),
            | (127, 'laptop5' , 'electronics5', 1671881778000, '2021-12-24')
            |""".stripMargin)
   
       spark.sql(
         s"""
            | insert into table1 values
            | (451, 'tshirt1' , 'mens wear1', 1671968178000, '2021-12-24'),
            | (452, 'tshirt2' , 'mens wear2', 1671968178000, '2021-12-25'),
            | (453, 'tshirt3' , 'mens wear3', 1671968178000, '2021-12-24'),
            | (454, 'tshirt4' , 'mens wear4', 1671968178000, '2021-12-25'),
            | (455, 'tshirt5' , 'mens wear5', 1671968178000, '2021-12-24')
            |""".stripMargin)
   
       spark.sql(
         s"""
            | insert into table1 values
            | (551, 'ts1' , 'wear1', 1671968178000, '2021-12-24'),
            | (552, 'ts2' , 'wear2', 1671968178000, '2021-12-25'),
            | (553, 'ts3' , 'wear3', 1671968178000, '2021-12-24'),
            | (554, 'ts4' , 'wear4', 1671968178000, '2021-12-25'),
            | (555, 'ts5' , 'wear5', 1671968178000, '2021-12-24')
            |""".stripMargin)
       // bulk_insert into Hudi with inline z-order clustering
       Seq("2021-12-24", "2021-12-25").foreach { utcDate =>
         val dfx = spark.read.parquet(s"/tmp/default/clustering/p1/utc_date=$utcDate")
           .withColumn("utc_date", lit(utcDate))
         val savePath = "/tmp/default/clustering/hudi_z_order"
         dfx.write.format("org.apache.hudi")
           .option("hoodie.table.name", "hudi_z_order_test")
           .option("hoodie.datasource.write.table.name", "hudi_z_order_test")
           .option("hoodie.datasource.write.operation", "bulk_insert")
           .option("hoodie.sql.insert.mode", "non-strict")
           .option("hoodie.datasource.write.precombine.field", "create_time")
           .option("hoodie.datasource.write.recordkey.field", "product_id")
           .option("hoodie.datasource.write.partitionpath.field", "utc_date")
           .option("hoodie.datasource.write.keygenerator.class", "org.apache.hudi.keygen.SimpleKeyGenerator")
           .option("hoodie.datasource.write.hive_style_partitioning", "true")
           .option("hoodie.bulkinsert.shuffle.parallelism", "3")
           // no sort at bulk_insert time; clustering re-sorts the data afterwards
           .option("hoodie.bulkinsert.sort.mode", "NONE")
           .option("hoodie.embed.timeline.server", "false")
           .option("hoodie.parquet.compression.codec", "zstd")
           // schedule and execute clustering inline after every commit
           .option("hoodie.clustering.inline", "true")
           .option("hoodie.clustering.inline.max.commits", "1")
           .option("hoodie.clustering.plan.strategy.target.file.max.bytes", "1073741824")
           .option("hoodie.clustering.plan.strategy.small.file.limit", "536870912")
           .option("hoodie.clustering.plan.strategy.sort.columns", "product_name,product_category")
           .option("hoodie.clustering.plan.strategy.max.bytes.per.group", Long.MaxValue.toString)
           // lay out the sort columns along a z-order curve during clustering
           .option("hoodie.layout.optimize.enable", "true")
           .option("hoodie.layout.optimize.strategy", "z-order")
           .mode(SaveMode.Append).save(savePath)
       }
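
   As a quick sanity check after the loop finishes (a minimal sketch, assuming the same /tmp/default/clustering/hudi_z_order path as above and a Hudi version where load on the base path resolves partitions, e.g. 0.10.x), the table reads back with all 15 source rows and each inline clustering run leaves a replacecommit on the timeline:

       // read the table back through the Hudi datasource (path assumed from the loop above)
       val hudiDf = spark.read.format("org.apache.hudi")
         .load("/tmp/default/clustering/hudi_z_order")
       assert(hudiDf.count() == 15) // 3 inserts x 5 rows copied from table1

       // inline clustering writes an <instant>.replacecommit into .hoodie
       val fs = path1.getFileSystem(spark.sparkContext.hadoopConfiguration)
       fs.listStatus(new Path("/tmp/default/clustering/hudi_z_order/.hoodie"))
         .map(_.getPath.getName)
         .filter(_.endsWith(".replacecommit"))
         .foreach(println)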

