[ https://issues.apache.org/jira/browse/HUDI-773?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17081030#comment-17081030 ]
Yanjia Gary Li commented on HUDI-773: ------------------------------------- Surprisingly easy... I tried the following test using a Spark 2.4 HDInsight cluster with Azure Data Lake Storage V2. Hudi ran out of the box. No extra config needed. {code:java} // Initial Batch val outputPath = "/Test/HudiWrite" val df1 = Seq( ("0", "year=2019", "test1", "pass", "201901"), ("1", "year=2019", "test1", "pass", "201901"), ("2", "year=2020", "test1", "pass", "201901"), ("3", "year=2020", "test1", "pass", "201901") ).toDF("_uuid", "_partition", "PARAM_NAME", "RESULT_STRING", "TIMESTAMP") val bulk_insert_ops = Map( DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "_uuid", DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "_partition", DataSourceWriteOptions.OPERATION_OPT_KEY -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL, DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "TIMESTAMP", "hoodie.bulkinsert.shuffle.parallelism" -> "10", "hoodie.upsert.shuffle.parallelism" -> "10", HoodieWriteConfig.TABLE_NAME -> "test" ) df1.write.format("org.apache.hudi").options(bulk_insert_ops).mode(SaveMode.Overwrite).save(outputPath) // Upsert val upsert_ops = Map( DataSourceWriteOptions.RECORDKEY_FIELD_OPT_KEY -> "_uuid", DataSourceWriteOptions.PARTITIONPATH_FIELD_OPT_KEY -> "_partition", DataSourceWriteOptions.OPERATION_OPT_KEY -> DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL, DataSourceWriteOptions.PRECOMBINE_FIELD_OPT_KEY -> "TIMESTAMP", "hoodie.bulkinsert.shuffle.parallelism" -> "10", "hoodie.upsert.shuffle.parallelism" -> "10", HoodieWriteConfig.TABLE_NAME -> "test" ) val df2 = Seq( ("0", "year=2019", "test1", "pass", "201910"), ("1", "year=2019", "test1", "pass", "201910"), ("2", "year=2020", "test1", "pass", "201910"), ("3", "year=2020", "test1", "pass", "201910") ).toDF("_uuid", "_partition", "PARAM_NAME", "RESULT_STRING", "TIMESTAMP") df2.write.format("org.apache.hudi").options(upsert_ops).mode(SaveMode.Append).save(outputPath) // Read as hudi format val df_read = 
spark.read.format("org.apache.hudi").option(DataSourceReadOptions.QUERY_TYPE_OPT_KEY, DataSourceReadOptions.QUERY_TYPE_SNAPSHOT_OPT_VAL).load(outputPath) assert(df_read.count() == 4){code} > Hudi On Azure Data Lake Storage V2 > ---------------------------------- > > Key: HUDI-773 > URL: https://issues.apache.org/jira/browse/HUDI-773 > Project: Apache Hudi (incubating) > Issue Type: New Feature > Components: Usability > Reporter: Yanjia Gary Li > Assignee: Yanjia Gary Li > Priority: Minor > -- This message was sent by Atlassian Jira (v8.3.4#803005)