spark git commit: [SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized
Repository: spark
Updated Branches:
  refs/heads/branch-1.4 0ed376afa -> 898be6248


[SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized

JIRA: https://issues.apache.org/jira/browse/SPARK-7447

`MetadataCache` in `ParquetRelation2` is annotated as `transient`. When `ParquetRelation2` is deserialized, we ask `MetadataCache` to refresh itself, which performs schema merging again. This is time-consuming, especially when there are many Parquet files.

With the new `FSBasedParquetRelation`, although `MetadataCache` is no longer `transient`, `MetadataCache.refresh()` still performs schema merging again when the relation is deserialized.

Author: Liang-Chi Hsieh

Closes #6012 from viirya/without_remerge_schema and squashes the following commits:

2663957 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
6ac7d93 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
b0fc09b [Liang-Chi Hsieh] Don't generate and merge parquetSchema multiple times.

(cherry picked from commit 339905578790fa37fcad9684b859b443313a5aa2)
Signed-off-by: Cheng Lian


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/898be624
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/898be624
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/898be624

Branch: refs/heads/branch-1.4
Commit: 898be6248940acddab6f40af0fbc7e7abb3adb76
Parents: 0ed376a
Author: Liang-Chi Hsieh
Authored: Sun May 17 15:42:21 2015 +0800
Committer: Cheng Lian
Committed: Sun May 17 15:42:40 2015 +0800

----------------------------------------------------------------------
 .../apache/spark/sql/parquet/newParquet.scala | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/898be624/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 946062f..bcbdb1e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -340,7 +340,7 @@ private[sql] class ParquetRelation2(
 
     // Schema of the actual Parquet files, without partition columns discovered from partition
     // directory paths.
-    var dataSchema: StructType = _
+    var dataSchema: StructType = null
 
     // Schema of the whole table, including partition columns.
     var schema: StructType = _
@@ -379,19 +379,23 @@ private[sql] class ParquetRelation2(
           f -> new Footer(f.getPath, parquetMetadata)
         }.seq.toMap
 
-      dataSchema = {
-        val dataSchema0 =
-          maybeDataSchema
-            .orElse(readSchema())
-            .orElse(maybeMetastoreSchema)
-            .getOrElse(sys.error("Failed to get the schema."))
-
-        // If this Parquet relation is converted from a Hive Metastore table, must reconcile
-        // case insensitivity issues and possible schema mismatches (probably caused by schema
-        // evolution).
-        maybeMetastoreSchema
-          .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
-          .getOrElse(dataSchema0)
+      // If we already have the schema, we don't need to re-compute it, since schema merging
+      // is time-consuming.
+      if (dataSchema == null) {
+        dataSchema = {
+          val dataSchema0 =
+            maybeDataSchema
+              .orElse(readSchema())
+              .orElse(maybeMetastoreSchema)
+              .getOrElse(sys.error("Failed to get the schema."))
+
+          // If this Parquet relation is converted from a Hive Metastore table, must reconcile
+          // case insensitivity issues and possible schema mismatches (probably caused by schema
+          // evolution).
+          maybeMetastoreSchema
+            .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
+            .getOrElse(dataSchema0)
+        }
       }
     }
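In essence, the patch memoizes the expensive merge: `dataSchema` starts out `null`, `refresh()` computes it only while it is still `null`, and because the field is no longer `@transient`, a deserialized copy of the relation keeps the value it already paid for. A minimal sketch of that pattern follows; the names (`Schema`, `MetadataCache`, `mergeSchemas`) are illustrative stand-ins, not the actual `ParquetRelation2` internals:

    // Sketch only: made-up names, not the real Spark API.
    case class Schema(fields: Seq[String])

    class MetadataCache(files: Seq[String]) extends Serializable {
      // Not @transient, so a deserialized relation keeps the schema it already merged.
      var dataSchema: Schema = null

      def refresh(): Unit = {
        // ... re-read footers, partition values, etc. ...
        // Schema merging is the expensive part, so run it only when no schema exists yet.
        if (dataSchema == null) {
          dataSchema = mergeSchemas()
        }
      }

      // Stand-in for reading and merging per-file Parquet footers.
      private def mergeSchemas(): Schema =
        Schema(files.map(f => s"field_from_$f"))
    }

Note that `var dataSchema: StructType = _` already default-initializes a reference field to `null` in Scala; spelling out `null` in the patch just makes the intent of the `if (dataSchema == null)` guard explicit.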
spark git commit: [SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized
Repository: spark
Updated Branches:
  refs/heads/master edf09ea1b -> 339905578


[SPARK-7447] [SQL] Don't re-merge Parquet schema when the relation is deserialized

JIRA: https://issues.apache.org/jira/browse/SPARK-7447

`MetadataCache` in `ParquetRelation2` is annotated as `transient`. When `ParquetRelation2` is deserialized, we ask `MetadataCache` to refresh itself, which performs schema merging again. This is time-consuming, especially when there are many Parquet files.

With the new `FSBasedParquetRelation`, although `MetadataCache` is no longer `transient`, `MetadataCache.refresh()` still performs schema merging again when the relation is deserialized.

Author: Liang-Chi Hsieh

Closes #6012 from viirya/without_remerge_schema and squashes the following commits:

2663957 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
6ac7d93 [Liang-Chi Hsieh] Merge remote-tracking branch 'upstream/master' into without_remerge_schema
b0fc09b [Liang-Chi Hsieh] Don't generate and merge parquetSchema multiple times.


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/33990557
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/33990557
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/33990557

Branch: refs/heads/master
Commit: 339905578790fa37fcad9684b859b443313a5aa2
Parents: edf09ea
Author: Liang-Chi Hsieh
Authored: Sun May 17 15:42:21 2015 +0800
Committer: Cheng Lian
Committed: Sun May 17 15:42:21 2015 +0800

----------------------------------------------------------------------
 .../apache/spark/sql/parquet/newParquet.scala | 32 ++++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/33990557/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
index 946062f..bcbdb1e 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/parquet/newParquet.scala
@@ -340,7 +340,7 @@ private[sql] class ParquetRelation2(
 
     // Schema of the actual Parquet files, without partition columns discovered from partition
     // directory paths.
-    var dataSchema: StructType = _
+    var dataSchema: StructType = null
 
     // Schema of the whole table, including partition columns.
     var schema: StructType = _
@@ -379,19 +379,23 @@ private[sql] class ParquetRelation2(
           f -> new Footer(f.getPath, parquetMetadata)
         }.seq.toMap
 
-      dataSchema = {
-        val dataSchema0 =
-          maybeDataSchema
-            .orElse(readSchema())
-            .orElse(maybeMetastoreSchema)
-            .getOrElse(sys.error("Failed to get the schema."))
-
-        // If this Parquet relation is converted from a Hive Metastore table, must reconcile
-        // case insensitivity issues and possible schema mismatches (probably caused by schema
-        // evolution).
-        maybeMetastoreSchema
-          .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
-          .getOrElse(dataSchema0)
+      // If we already have the schema, we don't need to re-compute it, since schema merging
+      // is time-consuming.
+      if (dataSchema == null) {
+        dataSchema = {
+          val dataSchema0 =
+            maybeDataSchema
+              .orElse(readSchema())
+              .orElse(maybeMetastoreSchema)
+              .getOrElse(sys.error("Failed to get the schema."))
+
+          // If this Parquet relation is converted from a Hive Metastore table, must reconcile
+          // case insensitivity issues and possible schema mismatches (probably caused by schema
+          // evolution).
+          maybeMetastoreSchema
+            .map(ParquetRelation2.mergeMetastoreParquetSchema(_, dataSchema0))
+            .getOrElse(dataSchema0)
+        }
       }
     }
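To see why the non-transient field plus the null guard prevents the re-merge, here is a small, self-contained round trip through Java serialization. `ExpensiveCache` is a hypothetical stand-in for `MetadataCache`, not Spark code:

    import java.io.{ByteArrayInputStream, ByteArrayOutputStream, ObjectInputStream, ObjectOutputStream}

    // Hypothetical stand-in: the merged result lives in a non-transient field
    // and is recomputed only when absent.
    class ExpensiveCache extends Serializable {
      var merged: String = null
      var mergeCount: Int = 0

      def refresh(): Unit = {
        if (merged == null) {   // the SPARK-7447 guard
          mergeCount += 1       // counts how often the expensive path actually runs
          merged = "merged-schema"
        }
      }
    }

    object RoundTripDemo extends App {
      val cache = new ExpensiveCache
      cache.refresh()                                 // expensive path runs once

      // Serialize and deserialize, as happens when the relation is shipped to executors.
      val out = new ByteArrayOutputStream()
      new ObjectOutputStream(out).writeObject(cache)
      val copy = new ObjectInputStream(new ByteArrayInputStream(out.toByteArray))
        .readObject().asInstanceOf[ExpensiveCache]

      copy.refresh()                                  // guard short-circuits: no re-merge
      assert(copy.mergeCount == 1)                    // still 1, not 2
    }

Without the guard (or with the whole cache marked `@transient`, as before), the second `refresh()` would re-run the expensive merge on every deserialized copy.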