cloud-fan commented on code in PR #46267: URL: https://github.com/apache/spark/pull/46267#discussion_r1596852480
########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala: ########## @@ -945,54 +945,87 @@ class SessionCatalog( throw QueryCompilationErrors.invalidViewText(viewText, metadata.qualifiedName) } } - val projectList = if (!isHiveCreatedView(metadata)) { - val viewColumnNames = if (metadata.viewQueryColumnNames.isEmpty) { - // For view created before Spark 2.2.0, the view text is already fully qualified, the plan - // output is the same with the view output. - metadata.schema.fieldNames.toImmutableArraySeq - } else { - assert(metadata.viewQueryColumnNames.length == metadata.schema.length) - metadata.viewQueryColumnNames - } + val schemaMode = metadata.viewSchemaMode + if (schemaMode == SchemaEvolution) { + View(desc = metadata, isTempView = isTempView, child = parsedPlan) + } else { + val projectList = if (!isHiveCreatedView(metadata)) { + val viewColumnNames = if (metadata.viewQueryColumnNames.isEmpty) { + // For view created before Spark 2.2.0, the view text is already fully qualified, the plan + // output is the same with the view output. + metadata.schema.fieldNames.toImmutableArraySeq + } else { + assert(metadata.viewQueryColumnNames.length == metadata.schema.length) + metadata.viewQueryColumnNames + } - // For view queries like `SELECT * FROM t`, the schema of the referenced table/view may - // change after the view has been created. We need to add an extra SELECT to pick the columns - // according to the recorded column names (to get the correct view column ordering and omit - // the extra columns that we don't require), with UpCast (to make sure the type change is - // safe) and Alias (to respect user-specified view column names) according to the view schema - // in the catalog. - // Note that, the column names may have duplication, e.g. `CREATE VIEW v(x, y) AS - // SELECT 1 col, 2 col`. 
We need to make sure that the matching attributes have the same - // number of duplications, and pick the corresponding attribute by ordinal. - val viewConf = View.effectiveSQLConf(metadata.viewSQLConfigs, isTempView) - val normalizeColName: String => String = if (viewConf.caseSensitiveAnalysis) { - identity + // For view queries like `SELECT * FROM t`, the schema of the referenced table/view may + // change after the view has been created. We need to add an extra SELECT to pick the + // columns according to the recorded column names (to get the correct view column ordering + // and omit the extra columns that we don't require), with UpCast (to make sure the type + // change is safe) and Alias (to respect user-specified view column names) according to the + // view schema in the catalog. + // Note that, the column names may have duplication, e.g. `CREATE VIEW v(x, y) AS + // SELECT 1 col, 2 col`. We need to make sure that the matching attributes have the same + // number of duplications, and pick the corresponding attribute by ordinal. + val viewConf = View.effectiveSQLConf(metadata.viewSQLConfigs, isTempView) + val normalizeColName: String => String = if (viewConf.caseSensitiveAnalysis) { + identity + } else { + _.toLowerCase(Locale.ROOT) + } + val nameToCounts = viewColumnNames.groupBy(normalizeColName).transform((_, v) => v.length) + val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int] + val viewDDL = buildViewDDL(metadata, isTempView) + + viewColumnNames.zip(metadata.schema).map { case (name, field) => + val normalizedName = normalizeColName(name) + val count = nameToCounts(normalizedName) + val ordinal = nameToCurrentOrdinal.getOrElse(normalizedName, 0) + nameToCurrentOrdinal(normalizedName) = ordinal + 1 + val col = GetViewColumnByNameAndOrdinal( + metadata.identifier.toString, name, ordinal, count, viewDDL) + val cast = schemaMode match { + /* + ** For schema binding, we cast the column to the expected type using safe cast only. 
+ ** For legacy behavior, we cast the column to the expected type using safe cast only. + ** For schema compensation, we cast the column to the expected type using any cast + * in ansi mode. + ** For schema (type) evolution, we take the column as is. + */ + case SchemaBinding => UpCast(col, field.dataType) + case SchemaUnsupported => UpCast(col, field.dataType) + case SchemaCompensation => Cast(col, field.dataType, ansiEnabled = true) + case SchemaTypeEvolution => col + case other => throw SparkException.internalError("Unexpected ViewSchemaMode") + } + Alias(cast, field.name)(explicitMetadata = Some(field.metadata)) + } } else { - _.toLowerCase(Locale.ROOT) - } - val nameToCounts = viewColumnNames.groupBy(normalizeColName).transform((_, v) => v.length) - val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int] - val viewDDL = buildViewDDL(metadata, isTempView) - - viewColumnNames.zip(metadata.schema).map { case (name, field) => - val normalizedName = normalizeColName(name) - val count = nameToCounts(normalizedName) - val ordinal = nameToCurrentOrdinal.getOrElse(normalizedName, 0) - nameToCurrentOrdinal(normalizedName) = ordinal + 1 - val col = GetViewColumnByNameAndOrdinal( - metadata.identifier.toString, name, ordinal, count, viewDDL) - Alias(UpCast(col, field.dataType), field.name)(explicitMetadata = Some(field.metadata)) - } - } else { - // For view created by hive, the parsed view plan may have different output columns with - // the schema stored in metadata. For example: `CREATE VIEW v AS SELECT 1 FROM t` - // the schema in metadata will be `_c0` while the parsed view plan has column named `1` - metadata.schema.zipWithIndex.map { case (field, index) => - val col = GetColumnByOrdinal(index, field.dataType) - Alias(UpCast(col, field.dataType), field.name)(explicitMetadata = Some(field.metadata)) + // For view created by hive, the parsed view plan may have different output columns with + // the schema stored in metadata. 
For example: `CREATE VIEW v AS SELECT 1 FROM t` + // the schema in metadata will be `_c0` while the parsed view plan has column named `1` + metadata.schema.zipWithIndex.map { case (field, index) => + val col = GetColumnByOrdinal(index, field.dataType) + val cast = schemaMode match { + /* + ** For schema binding, we cast the column to the expected type using safe cast only. + ** For legacy behavior, we cast the column to the expected type using safe cast only. + ** For schema compensation, we cast the column to the expected type using any cast + * in ansi mode. + ** For schema (type) evolution, we take the column as is. + */ + case SchemaBinding => UpCast(col, field.dataType) + case SchemaUnsupported => UpCast(col, field.dataType) + case SchemaCompensation => Cast(col, field.dataType, ansiEnabled = true) + case SchemaTypeEvolution => col + case other => throw SparkException.internalError("Unexpected ViewSchemaMode") + } + Alias(cast, field.name)(explicitMetadata = Some(field.metadata)) Review Comment: nit: let's create a method to do the cast, to avoid code duplication ########## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/catalog/SessionCatalog.scala: ########## @@ -945,54 +945,87 @@ class SessionCatalog( throw QueryCompilationErrors.invalidViewText(viewText, metadata.qualifiedName) } } - val projectList = if (!isHiveCreatedView(metadata)) { - val viewColumnNames = if (metadata.viewQueryColumnNames.isEmpty) { - // For view created before Spark 2.2.0, the view text is already fully qualified, the plan - // output is the same with the view output. 
- metadata.schema.fieldNames.toImmutableArraySeq - } else { - assert(metadata.viewQueryColumnNames.length == metadata.schema.length) - metadata.viewQueryColumnNames - } + val schemaMode = metadata.viewSchemaMode + if (schemaMode == SchemaEvolution) { + View(desc = metadata, isTempView = isTempView, child = parsedPlan) + } else { + val projectList = if (!isHiveCreatedView(metadata)) { + val viewColumnNames = if (metadata.viewQueryColumnNames.isEmpty) { + // For view created before Spark 2.2.0, the view text is already fully qualified, the plan + // output is the same with the view output. + metadata.schema.fieldNames.toImmutableArraySeq + } else { + assert(metadata.viewQueryColumnNames.length == metadata.schema.length) + metadata.viewQueryColumnNames + } - // For view queries like `SELECT * FROM t`, the schema of the referenced table/view may - // change after the view has been created. We need to add an extra SELECT to pick the columns - // according to the recorded column names (to get the correct view column ordering and omit - // the extra columns that we don't require), with UpCast (to make sure the type change is - // safe) and Alias (to respect user-specified view column names) according to the view schema - // in the catalog. - // Note that, the column names may have duplication, e.g. `CREATE VIEW v(x, y) AS - // SELECT 1 col, 2 col`. We need to make sure that the matching attributes have the same - // number of duplications, and pick the corresponding attribute by ordinal. - val viewConf = View.effectiveSQLConf(metadata.viewSQLConfigs, isTempView) - val normalizeColName: String => String = if (viewConf.caseSensitiveAnalysis) { - identity + // For view queries like `SELECT * FROM t`, the schema of the referenced table/view may + // change after the view has been created. 
We need to add an extra SELECT to pick the + // columns according to the recorded column names (to get the correct view column ordering + // and omit the extra columns that we don't require), with UpCast (to make sure the type + // change is safe) and Alias (to respect user-specified view column names) according to the + // view schema in the catalog. + // Note that, the column names may have duplication, e.g. `CREATE VIEW v(x, y) AS + // SELECT 1 col, 2 col`. We need to make sure that the matching attributes have the same + // number of duplications, and pick the corresponding attribute by ordinal. + val viewConf = View.effectiveSQLConf(metadata.viewSQLConfigs, isTempView) + val normalizeColName: String => String = if (viewConf.caseSensitiveAnalysis) { + identity + } else { + _.toLowerCase(Locale.ROOT) + } + val nameToCounts = viewColumnNames.groupBy(normalizeColName).transform((_, v) => v.length) + val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int] + val viewDDL = buildViewDDL(metadata, isTempView) + + viewColumnNames.zip(metadata.schema).map { case (name, field) => + val normalizedName = normalizeColName(name) + val count = nameToCounts(normalizedName) + val ordinal = nameToCurrentOrdinal.getOrElse(normalizedName, 0) + nameToCurrentOrdinal(normalizedName) = ordinal + 1 + val col = GetViewColumnByNameAndOrdinal( + metadata.identifier.toString, name, ordinal, count, viewDDL) + val cast = schemaMode match { + /* + ** For schema binding, we cast the column to the expected type using safe cast only. + ** For legacy behavior, we cast the column to the expected type using safe cast only. + ** For schema compensation, we cast the column to the expected type using any cast + * in ansi mode. + ** For schema (type) evolution, we take the column as is. 
+ */ + case SchemaBinding => UpCast(col, field.dataType) + case SchemaUnsupported => UpCast(col, field.dataType) + case SchemaCompensation => Cast(col, field.dataType, ansiEnabled = true) + case SchemaTypeEvolution => col + case other => throw SparkException.internalError("Unexpected ViewSchemaMode") + } + Alias(cast, field.name)(explicitMetadata = Some(field.metadata)) + } } else { - _.toLowerCase(Locale.ROOT) - } - val nameToCounts = viewColumnNames.groupBy(normalizeColName).transform((_, v) => v.length) - val nameToCurrentOrdinal = scala.collection.mutable.HashMap.empty[String, Int] - val viewDDL = buildViewDDL(metadata, isTempView) - - viewColumnNames.zip(metadata.schema).map { case (name, field) => - val normalizedName = normalizeColName(name) - val count = nameToCounts(normalizedName) - val ordinal = nameToCurrentOrdinal.getOrElse(normalizedName, 0) - nameToCurrentOrdinal(normalizedName) = ordinal + 1 - val col = GetViewColumnByNameAndOrdinal( - metadata.identifier.toString, name, ordinal, count, viewDDL) - Alias(UpCast(col, field.dataType), field.name)(explicitMetadata = Some(field.metadata)) - } - } else { - // For view created by hive, the parsed view plan may have different output columns with - // the schema stored in metadata. For example: `CREATE VIEW v AS SELECT 1 FROM t` - // the schema in metadata will be `_c0` while the parsed view plan has column named `1` - metadata.schema.zipWithIndex.map { case (field, index) => - val col = GetColumnByOrdinal(index, field.dataType) - Alias(UpCast(col, field.dataType), field.name)(explicitMetadata = Some(field.metadata)) + // For view created by hive, the parsed view plan may have different output columns with + // the schema stored in metadata. 
For example: `CREATE VIEW v AS SELECT 1 FROM t` + // the schema in metadata will be `_c0` while the parsed view plan has column named `1` + metadata.schema.zipWithIndex.map { case (field, index) => + val col = GetColumnByOrdinal(index, field.dataType) + val cast = schemaMode match { + /* + ** For schema binding, we cast the column to the expected type using safe cast only. + ** For legacy behavior, we cast the column to the expected type using safe cast only. + ** For schema compensation, we cast the column to the expected type using any cast + * in ansi mode. + ** For schema (type) evolution, we take the column as is. + */ + case SchemaBinding => UpCast(col, field.dataType) + case SchemaUnsupported => UpCast(col, field.dataType) + case SchemaCompensation => Cast(col, field.dataType, ansiEnabled = true) + case SchemaTypeEvolution => col + case other => throw SparkException.internalError("Unexpected ViewSchemaMode") + } + Alias(cast, field.name)(explicitMetadata = Some(field.metadata)) Review Comment: nit: let's create a method to do the cast, to avoid code duplication -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org