xushiyan edited a comment on issue #3617: URL: https://github.com/apache/hudi/issues/3617#issuecomment-927522620
@novakov-alexey I checked, and the behavior is fixed in 0.9.0. Please give release-0.9.0 a try. You can find a guide on overriding the Hudi jars in EMR here: https://hudi.apache.org/learn/faq#how-to-override-hudi-jars-in-emr I used this snippet to reproduce your scenario with 0.9.0. After the bulk insert commit, I can see that the schema value is populated in the commit file. This should allow you to sync with the Glue Catalog. ```scala val opts = Map( DataSourceWriteOptions.TABLE_NAME.key() -> "language", DataSourceWriteOptions.TABLE_TYPE.key() -> DataSourceWriteOptions.COW_TABLE_TYPE_OPT_VAL, DataSourceWriteOptions.OPERATION.key() -> DataSourceWriteOptions.BULK_INSERT_OPERATION_OPT_VAL, DataSourceWriteOptions.RECORDKEY_FIELD.key() -> "lang", DataSourceWriteOptions.PRECOMBINE_FIELD.key() -> "score", DataSourceWriteOptions.KEYGENERATOR_CLASS_NAME.key() -> classOf[NonpartitionedKeyGenerator].getCanonicalName, DataSourceWriteOptions.HIVE_PARTITION_EXTRACTOR_CLASS.key() -> classOf[NonPartitionedExtractor].getCanonicalName, DataSourceWriteOptions.HIVE_STYLE_PARTITIONING.key() -> "true", HoodieWriteConfig.TBL_NAME.key() -> "language", HoodieWriteConfig.INSERT_PARALLELISM_VALUE.key() -> "1", HoodieWriteConfig.UPSERT_PARALLELISM_VALUE.key() -> "1", HoodieWriteConfig.BULKINSERT_PARALLELISM_VALUE.key() -> "1", HoodieWriteConfig.FINALIZE_WRITE_PARALLELISM_VALUE.key() -> "1", "spark.default.parallelism" -> "1", "spark.sql.shuffle.partitions" -> "1" ) val simpleSchema = StructType(Array( StructField("lang", StringType, nullable = false), StructField("score", IntegerType, nullable = false) )) val emptyDF = spark.createDataFrame(spark.sparkContext.emptyRDD[Row], simpleSchema) emptyDF.write.format("hudi").options(opts).mode(Overwrite).save(basePath) val data = spark.read.format("hudi").load(basePath) data.show() ``` ```json ➜ cat .hoodie/20210926212533.commit { "partitionToWriteStats" : { }, "compacted" : false, "extraMetadata" : { "schema" : 
"{\"type\":\"record\",\"name\":\"language_record\",\"namespace\":\"hoodie.language\",\"fields\":[{\"name\":\"lang\",\"type\":\"string\"},{\"name\":\"score\",\"type\":\"int\"}]}" }, "operationType" : "BULK_INSERT", "fileIdAndRelativePaths" : { }, "totalRecordsDeleted" : 0, "totalLogRecordsCompacted" : 0, "totalLogFilesCompacted" : 0, "totalCompactedRecordsUpdated" : 0, "totalLogFilesSize" : 0, "totalScanTime" : 0, "totalCreateTime" : 0, "totalUpsertTime" : 0, "minAndMaxEventTime" : { "Optional.empty" : { "val" : null, "present" : false } }, "writePartitionPaths" : [ ] } ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@hudi.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org