[ https://issues.apache.org/jira/browse/HUDI-7579?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Vinaykumar Bhat updated HUDI-7579: ---------------------------------- Description: The following create-table and inserts should create a table with 3 partitions (with each partition having one file slice): {code:java} spark.sql( s""" |create table test_table( | id int, | name string, | ts long, | price int |) using hudi | options ( | primaryKey ='id', | type = 'cow', | preCombineField = 'ts', | hoodie.metadata.record.index.enable = 'true', | hoodie.datasource.write.recordkey.field = 'id' | ) | partitioned by(price) | location '$basePath' """.stripMargin) spark.sql(s"insert into test_table (id, name, ts, price) values(1, 'a1', 1000, 10)") spark.sql(s"insert into test_table (id, name, ts, price) values(2, 'a2', 200000, 100)") spark.sql(s"insert into test_table (id, name, ts, price) values(3, 'a3', 2000000000, 1000)"){code} Now create a functional index (using col-stats) on this table. The col-stats in the MDT should have three entries (representing column-level stats for the 3 files). However, col-stats has only a single entry (for one of the files). 
{code:java} var createIndexSql = s"create index idx_datestr on test_table using column_stats(ts) options(func='from_unixtime', format='yyyy-MM-dd')" spark.sql(createIndexSql) spark.sql(s"select key, type, ColumnStatsMetadata from hudi_metadata('test_table') where type = 3").show(false) {code} As seen below, col-stats has only one entry, for one of the files (and is missing statistics for the other two files): *{32490467-702f-4bb4-81e8-91082da9baf0-0_0-28-66_20240409095623406.parquet, ts, \{null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, \{null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, 1, 0, 434874, 869748, false}* {code:java} +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |key |type|ColumnStatsMetadata | +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |oyTjviKHuhI=/vI1OU7mFjI=Ev9dj4Bf3S0TEjEiWebRSQ==|3 |{32490467-702f-4bb4-81e8-91082da9baf0-0_0-28-66_20240409095623406.parquet, ts, {null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, {null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, 1, 0, 434874, 869748, false}| +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ {code} was: The 
following create-table and inserts should create a table with 3 partitions (with each partition having one slice){{{{}}{}}} {code:java} spark.sql( s""" |create table test_table( | id int, | name string, | ts long, | price int |) using hudi | options ( | primaryKey ='id', | type = '$tableType', | preCombineField = 'ts', | hoodie.metadata.record.index.enable = 'true', | hoodie.datasource.write.recordkey.field = 'id' | ) | partitioned by(price) | location '$basePath' """.stripMargin) spark.sql(s"insert into test_table (id, name, ts, price) values(1, 'a1', 1000, 10)") spark.sql(s"insert into test_table (id, name, ts, price) values(2, 'a2', 200000, 100)") spark.sql(s"insert into test_table (id, name, ts, price) values(3, 'a3', 2000000000, 1000)"){code} Now create a functional index (using col stats) on this table. The col-stat in the MDT should have three entries (representing column level stats for 3 files). However, col stats only has one single entry (for one of the file). {code:java} var createIndexSql = s"create index idx_datestr on $tableName using column_stats(ts) options(func='from_unixtime', format='yyyy-MM-dd')" spark.sql(createIndexSql) spark.sql(s"select key, type, ColumnStatsMetadata from hudi_metadata('$tableName') where type = 3").show(false) {code} As seen below, col-stats has only one entry for one of the file (and is missing statistics for two other files): *{32490467-702f-4bb4-81e8-91082da9baf0-0_0-28-66_20240409095623406.parquet, ts, \{null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, \{null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, 1, 0, 434874, 869748, false}* {code:java} +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |key 
|type|ColumnStatsMetadata | +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |oyTjviKHuhI=/vI1OU7mFjI=Ev9dj4Bf3S0TEjEiWebRSQ==|3 |{32490467-702f-4bb4-81e8-91082da9baf0-0_0-28-66_20240409095623406.parquet, ts, {null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, {null, null, null, null, null, null, {1970-01-01}, null, null, null, null}, 1, 0, 434874, 869748, false}| +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ {code} > Functional index (on col stats) creation fails to process all files/partitions > ------------------------------------------------------------------------------ > > Key: HUDI-7579 > URL: https://issues.apache.org/jira/browse/HUDI-7579 > Project: Apache Hudi > Issue Type: Bug > Reporter: Vinaykumar Bhat > Priority: Major > > The following create-table and inserts should create a table with 3 > partitions (with each partition having one slice){{{{}}{}}} > {code:java} > spark.sql( > s""" > |create table test_table( > | id int, > | name string, > | ts long, > | price int > |) using hudi > | options ( > | primaryKey ='id', > | type = 'cow', > | preCombineField = 'ts', > | hoodie.metadata.record.index.enable = 'true', > | hoodie.datasource.write.recordkey.field = 'id' > | ) > | partitioned by(price) > | location '$basePath' > """.stripMargin) > spark.sql(s"insert into test_table (id, name, ts, price) values(1, 'a1', > 1000, 10)") > spark.sql(s"insert into test_table (id, name, ts, price) values(2, 'a2', > 
200000, 100)") > spark.sql(s"insert into test_table (id, name, ts, price) values(3, 'a3', > 2000000000, 1000)"){code} > Now create a functional index (using col stats) on this table. The col-stat > in the MDT should have three entries (representing column level stats for 3 > files). However, col stats only has one single entry (for one of the file). > > {code:java} > var createIndexSql = s"create index idx_datestr on test_table using > column_stats(ts) options(func='from_unixtime', format='yyyy-MM-dd')" > spark.sql(createIndexSql) > spark.sql(s"select key, type, ColumnStatsMetadata from > hudi_metadata('test_table') where type = 3").show(false) {code} > As seen below, col-stats has only one entry for one of the file (and is > missing statistics for two other files): > *{32490467-702f-4bb4-81e8-91082da9baf0-0_0-28-66_20240409095623406.parquet, > ts, \{null, null, null, null, null, null, {1970-01-01}, null, null, null, > null}, \{null, null, null, null, null, null, {1970-01-01}, null, null, null, > null}, 1, 0, 434874, 869748, false}* > {code:java} > +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ > |key |type|ColumnStatsMetadata > > > > | > +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ > |oyTjviKHuhI=/vI1OU7mFjI=Ev9dj4Bf3S0TEjEiWebRSQ==|3 > |{32490467-702f-4bb4-81e8-91082da9baf0-0_0-28-66_20240409095623406.parquet, > ts, {null, null, null, null, null, null, {1970-01-01}, null, null, null, > null}, {null, null, null, null, null, null, 
{1970-01-01}, null, null, null, > null}, 1, 0, 434874, 869748, false}| > +------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ > {code} > > -- This message was sent by Atlassian Jira (v8.20.10#820010)