Github user wangyum commented on the issue: https://github.com/apache/spark/pull/22743 This happens when a table's `LogicalRelation` has been cached: changing `spark.sql.statistics.fallBackToHdfs` or `spark.sql.defaultSizeInBytes` afterwards has no effect on the stats, because the stats already cached in the `LogicalRelation` are always used. Here is an example: ```scala import org.apache.spark.sql.catalyst.QualifiedTableName import org.apache.spark.sql.catalyst.catalog.SessionCatalog import org.apache.spark.sql.execution.datasources.LogicalRelation spark.sql("CREATE TABLE t1 (c1 bigint) STORED AS PARQUET") spark.sql("INSERT INTO TABLE t1 VALUES (1)") spark.sql("REFRESH TABLE t1") val catalog = spark.sessionState.catalog val qualifiedTableName = QualifiedTableName(catalog.getCurrentDatabase, "t1") spark.sql("SELECT * from t1").collect() val cachedRelation = catalog.getCachedTable(qualifiedTableName) cachedRelation.asInstanceOf[LogicalRelation].catalogTable.get.stats.get.sizeInBytes // res4: BigInt = 9223372036854775807 spark.sql("set spark.sql.statistics.fallBackToHdfs=true") spark.sql("SELECT * from t1").collect() val cachedRelation = catalog.getCachedTable(qualifiedTableName) cachedRelation.asInstanceOf[LogicalRelation].catalogTable.get.stats.get.sizeInBytes // res7: BigInt = 9223372036854775807 // The size should now be computed from the file system, but it is still 9223372036854775807 spark.sql("REFRESH TABLE t1") spark.sql("SELECT * from t1").collect() val cachedRelation = catalog.getCachedTable(qualifiedTableName) cachedRelation.asInstanceOf[LogicalRelation].catalogTable.get.stats.get.sizeInBytes // res10: BigInt = 708 // After refreshing the table, the size is correct. ```
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org