xushiyan edited a comment on issue #3617:
URL: https://github.com/apache/hudi/issues/3617#issuecomment-927522620


   @novakov-alexey I checked that the behavior is fixed in 0.9.0. Please give 
release-0.9.0 a try. You can find a guide here on overriding the EMR Hudi jars: 
https://hudi.apache.org/learn/faq#how-to-override-hudi-jars-in-emr
   
   I used this snippet to reproduce your scenario with 0.9.0. After the bulk 
insert commit, I can see that the schema value is populated in the commit file. This 
should allow you to sync with the Glue Catalog.
   
   ```scala
       val opts = Map(
         DataSourceWriteOptions.TABLE_NAME.key() -> "language",
         DataSourceWriteOptions.TABLE_TYPE.key() -> 
DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL,
         DataSourceWriteOptions.OPERATION.key() -> 
DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL,
         DataSourceWriteOptions.RECORDKEY_FIELD.key() -> "lang",
         DataSourceWriteOptions.PRECOMBINE_FIELD.key() -> "score",
         DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key() -> 
classOf[NonpartitionedKeyGenerator].getCanonicalName,
         DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS.key() -> 
classOf[NonPartitionedExtractor].getCanonicalName,
         DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key() -> "true",
         HoodieWriteConfig.TBL_NAME.key() -> "language",
         HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key() -> "1",
         HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key() -> "1",
         HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE.key() -> "1",
         HoodieWriteConfig.FINALIZE_WRITE_PARALLELISM_VALUE.key() -> "1",
         "spark.default.parallelism" -> "1",
         "spark.sql.shuffle.partitions" -> "1"
       )
   
       val simpleSchema = StructType(Array(
         StructField("lang", StringType, nullable = false),
         StructField("score", IntegerType, nullable = false)
       ))
       val emptyDF = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], 
simpleSchema)
       emptyDF.write.format("hudi").options(opts).mode(Overwrite).save(basePath)
       val data = spark.read.format("hudi").load(basePath)
       data.show()
   ```
   
   ```json
   ➜ cat .hoodie/20210926212533.commit
   {
     "partitionToWriteStats" : { },
     "compacted" : false,
     "extraMetadata" : {
       "schema" : 
"{\"type\":\"record\",\"name\":\"language_record\",\"namespace\":\"hoodie.language\",\"fields\":[{\"name\":\"lang\",\"type\":\"string\"},{\"name\":\"score\",\"type\":\"int\"}]}"
     },
     "operationType" : "BULK_INSERT",
     "fileIdAndRelativePaths" : { },
     "totalRecordsDeleted" : 0,
     "totalLogRecordsCompacted" : 0,
     "totalLogFilesCompacted" : 0,
     "totalCompactedRecordsUpdated" : 0,
     "totalLogFilesSize" : 0,
     "totalScanTime" : 0,
     "totalCreateTime" : 0,
     "totalUpsertTime" : 0,
     "minAndMaxEventTime" : {
       "Optional.empty" : {
         "val" : null,
         "present" : false
       }
     },
     "writePartitionPaths" : [ ]
   }
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


Reply via email to