MrAladdin opened a new issue, #11007:
URL: https://github.com/apache/hudi/issues/11007

   **Describe the problem you faced**
   1. Spark Structured Streaming: upserts into a MOR table with the record index (`RECORD_INDEX`).
   2. After compaction completes, a large number of zero-size log files are left behind, and the cleaner never removes them (see the sketch below for one way to confirm the symptom).
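
   A minimal sketch for confirming the symptom, assuming the standard Hadoop `FileSystem` API; the base path below is a placeholder, not a path from this setup:

   ```scala
   import org.apache.hadoop.conf.Configuration
   import org.apache.hadoop.fs.{FileSystem, Path}

   // Placeholder base path; substitute the actual Hudi table location.
   val basePath = new Path("hdfs://nameservice/warehouse/hudi/the_table")
   val fs = FileSystem.get(basePath.toUri, new Configuration())

   // Hudi log segments carry ".log." in their file names; keep only zero-length ones.
   val files = fs.listFiles(basePath, /* recursive = */ true)
   while (files.hasNext) {
     val status = files.next()
     if (status.getPath.getName.contains(".log.") && status.getLen == 0) {
       println(s"zero-size log: ${status.getPath}")
     }
   }
   ```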
   
   
   **Environment Description**
   
   * Hudi version: 0.14.1

   * Spark version: 3.4.1

   * Hive version: 3.1.2

   * Hadoop version: 3.1.3

   * Storage (HDFS/S3/GCS..): HDFS

   * Running on Docker? (yes/no): no
   
   
   **Additional context**
   ```scala
   // Streaming source and the terminal .start(...) call are omitted in the report.
   .writeStream
     .format("hudi")
     .option("hoodie.table.base.file.format", "PARQUET")
     .option("hoodie.allow.empty.commit", "true")
     .option("hoodie.datasource.write.drop.partition.columns", "false")
     .option("hoodie.table.services.enabled", "true")
     .option("hoodie.datasource.write.streaming.checkpoint.identifier", "lakehouse-dwd-social-kbi-beauty-v1-writer-1")
     .option(PRECOMBINE_FIELD.key(), "date_kbiUdate")
     .option(RECORDKEY_FIELD.key(), "records_key")
     .option(PARTITIONPATH_FIELD.key(), "partition_index_date")
     .option(DataSourceWriteOptions.OPERATION.key(), DataSourceWriteOptions.UPSERT_OPERATION_OPT_VAL)
     .option(DataSourceWriteOptions.TABLE_TYPE.key(), DataSourceWriteOptions.MOR_TABLE_TYPE_OPT_VAL)
     .option("hoodie.combine.before.upsert", "true")
     .option("hoodie.datasource.write.payload.class", "org.apache.hudi.common.model.OverwriteWithLatestAvroPayload")

     // markers
     .option("hoodie.write.markers.type", "DIRECT")

     // timeline server
     .option("hoodie.embed.timeline.server", "true")

     // file system view storage
     .option("hoodie.filesystem.view.remote.timeout.secs", "1200")
     .option("hoodie.filesystem.view.remote.retry.enable", "true")
     .option("hoodie.filesystem.view.remote.retry.initial_interval_ms", "500")
     .option("hoodie.filesystem.view.remote.retry.max_numbers", "15")
     .option("hoodie.filesystem.view.remote.retry.max_interval_ms", "8000")

     // schema cache
     .option("hoodie.schema.cache.enable", "true")

     // spark streaming write
     .option("hoodie.datasource.write.streaming.ignore.failed.batch", "false")
     .option("hoodie.datasource.write.streaming.retry.count", "6")
     .option("hoodie.datasource.write.streaming.retry.interval.ms", "3000")

     // metadata table
     .option("hoodie.metadata.enable", "true")
     .option("hoodie.metadata.index.async", "false")
     .option("hoodie.metadata.index.check.timeout.seconds", "900")
     .option("hoodie.auto.adjust.lock.configs", "true")
     .option("hoodie.metadata.optimized.log.blocks.scan.enable", "true")
     .option("hoodie.metadata.index.column.stats.enable", "false")
     .option("hoodie.metadata.index.column.stats.parallelism", "100")
     .option("hoodie.metadata.index.column.stats.file.group.count", "4")
     .option("hoodie.metadata.index.column.stats.column.list", "date_udate,date_publishedAt")
     .option("hoodie.metadata.compact.max.delta.commits", "10")

     // record index
     .option("hoodie.metadata.record.index.enable", "true")
     .option("hoodie.index.type", "RECORD_INDEX")
     .option("hoodie.metadata.max.init.parallelism", "100000")
     .option("hoodie.metadata.record.index.min.filegroup.count", "10")
     .option("hoodie.metadata.record.index.max.filegroup.count", "10000")
     .option("hoodie.metadata.record.index.max.filegroup.size", "1073741824")
     .option("hoodie.metadata.auto.initialize", "true")
     .option("hoodie.metadata.record.index.growth.factor", "2.0")
     .option("hoodie.metadata.max.logfile.size", "2147483648")
     .option("hoodie.metadata.log.compaction.enable", "false")
     .option("hoodie.metadata.log.compaction.blocks.threshold", "5")
     .option("hoodie.metadata.max.deltacommits.when_pending", "1000")

     // file sizing
     .option("hoodie.parquet.field_id.write.enabled", "true")
     .option("hoodie.copyonwrite.insert.auto.split", "true")
     .option("hoodie.record.size.estimation.threshold", "1.0")
     .option("hoodie.parquet.block.size", "536870912")
     .option("hoodie.parquet.max.file.size", "536870912")
     .option("hoodie.parquet.small.file.limit", "314572800")
     .option("hoodie.logfile.max.size", "536870912")
     .option("hoodie.logfile.data.block.max.size", "536870912")
     .option("hoodie.logfile.to.parquet.compression.ratio", "0.35")

     // archival
     .option("hoodie.keep.max.commits", "30")
     .option("hoodie.keep.min.commits", "20")
     .option("hoodie.commits.archival.batch", "10")
     .option("hoodie.archive.automatic", "true")
     .option("hoodie.archive.async", "true")
     .option("hoodie.archive.beyond.savepoint", "true")
     .option("hoodie.fail.on.timeline.archiving", "true")

     // cleaner
     .option("hoodie.clean.allow.multiple", "true")
     .option("hoodie.cleaner.incremental.mode", "true")
     .option("hoodie.clean.async", "true")
     .option("hoodie.cleaner.policy.failed.writes", "LAZY")
     .option("hoodie.cleaner.delete.bootstrap.base.file", "true")
     .option("hoodie.clean.automatic", "true")
     .option("hoodie.cleaner.policy", "KEEP_LATEST_BY_HOURS")
     .option("hoodie.cleaner.hours.retained", "6")
     .option("hoodie.clean.trigger.strategy", "NUM_COMMITS")
     .option("hoodie.clean.max.commits", "10")

     // compaction
     .option("hoodie.datasource.compaction.async.enable", "true")
     .option("hoodie.compact.inline", "false")
     .option("hoodie.compact.schedule.inline", "false")
     .option("hoodie.compaction.lazy.block.read", "true")
     .option("hoodie.compaction.reverse.log.read", "false")
     .option("hoodie.compaction.target.io", compact_limit)
     .option("hoodie.compaction.strategy", "org.apache.hudi.table.action.compact.strategy.LogFileSizeBasedCompactionStrategy")
     .option("hoodie.compact.inline.trigger.strategy", "NUM_AND_TIME")
     .option("hoodie.compact.inline.max.delta.commits", "5")
     .option("hoodie.compact.inline.max.delta.seconds", "3600")
     .option("hoodie.memory.compaction.fraction", "0.6")

     // schema handling
     .option("hoodie.datasource.write.reconcile.schema", "true")
     .option("hoodie.avro.schema.external.transformation", "true")
     .option("hoodie.avro.schema.validate", "true")

     // locking
     .option("hoodie.write.concurrency.mode", "optimistic_concurrency_control")
     .option("hoodie.write.lock.provider", "org.apache.hudi.client.transaction.lock.FileSystemBasedLockProvider")
     .option("hoodie.write.lock.filesystem.expire", "10")
   ```
   
   
   

