This is an automated email from the ASF dual-hosted git repository. hvanhovell pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new c036836 [SPARK-26495][SQL] Simplify the SelectedField extractor. c036836 is described below commit c0368363f8a81dd739c6c90fb2849b2a3ab4d8e4 Author: Herman van Hovell <hvanhov...@databricks.com> AuthorDate: Mon Dec 31 17:46:06 2018 +0100 [SPARK-26495][SQL] Simplify the SelectedField extractor. ## What changes were proposed in this pull request? The current `SelectedField` extractor is somewhat complicated and it seems to be handling cases that should be handled automatically: - `GetArrayItem(child: GetStructFieldObject())` - `GetArrayStructFields(child: GetArrayStructFields())` - `GetMap(value: GetStructFieldObject())` This PR removes those cases and simplifies the extractor by passing down the data type instead of a field. ## How was this patch tested? Existing tests. Closes #23397 from hvanhovell/SPARK-26495. Authored-by: Herman van Hovell <hvanhov...@databricks.com> Signed-off-by: Herman van Hovell <hvanhov...@databricks.com> --- .../apache/spark/sql/execution/SelectedField.scala | 103 ++++++++------------- 1 file changed, 41 insertions(+), 62 deletions(-) diff --git a/sql/core/src/main/scala/org/apache/spark/sql/execution/SelectedField.scala b/sql/core/src/main/scala/org/apache/spark/sql/execution/SelectedField.scala index 0e7c593..68f797a 100644 --- a/sql/core/src/main/scala/org/apache/spark/sql/execution/SelectedField.scala +++ b/sql/core/src/main/scala/org/apache/spark/sql/execution/SelectedField.scala @@ -17,6 +17,7 @@ package org.apache.spark.sql.execution +import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ @@ -51,8 +52,6 @@ import org.apache.spark.sql.types._ * type appropriate to the complex type extractor. In our example, the name of the child expression * is "name" and its data type is a [[org.apache.spark.sql.types.StructType]] with a single string * field named "first". 
- * - * @param expr the top-level complex type extractor */ private[execution] object SelectedField { def unapply(expr: Expression): Option[StructField] = { @@ -64,71 +63,51 @@ private[execution] object SelectedField { selectField(unaliased, None) } - private def selectField(expr: Expression, fieldOpt: Option[StructField]): Option[StructField] = { + /** + * Convert an expression into the parts of the schema (the field) it accesses. + */ + private def selectField(expr: Expression, dataTypeOpt: Option[DataType]): Option[StructField] = { expr match { - // No children. Returns a StructField with the attribute name or None if fieldOpt is None. - case AttributeReference(name, dataType, nullable, metadata) => - fieldOpt.map(field => - StructField(name, wrapStructType(dataType, field), nullable, metadata)) - // Handles case "expr0.field[n]", where "expr0" is of struct type and "expr0.field" is of - // array type. - case GetArrayItem(x @ GetStructFieldObject(child, field @ StructField(name, - dataType, nullable, metadata)), _) => - val childField = fieldOpt.map(field => StructField(name, - wrapStructType(dataType, field), nullable, metadata)).getOrElse(field) - selectField(child, Some(childField)) - // Handles case "expr0.field[n]", where "expr0.field" is of array type. - case GetArrayItem(child, _) => - selectField(child, fieldOpt) - // Handles case "expr0.field.subfield", where "expr0" and "expr0.field" are of array type. - case GetArrayStructFields(child: GetArrayStructFields, - field @ StructField(name, dataType, nullable, metadata), _, _, _) => - val childField = fieldOpt.map(field => StructField(name, - wrapStructType(dataType, field), - nullable, metadata)).orElse(Some(field)) - selectField(child, childField) - // Handles case "expr0.field", where "expr0" is of array type. 
- case GetArrayStructFields(child,
- field @ StructField(name, dataType, nullable, metadata), _, _, _) =>
- val childField =
- fieldOpt.map(field => StructField(name,
- wrapStructType(dataType, field),
- nullable, metadata)).orElse(Some(field))
- selectField(child, childField)
- // Handles case "expr0.field[key]", where "expr0" is of struct type and "expr0.field" is of
- // map type.
- case GetMapValue(x @ GetStructFieldObject(child, field @ StructField(name,
- dataType,
- nullable, metadata)), _) =>
- val childField = fieldOpt.map(field => StructField(name,
- wrapStructType(dataType, field),
- nullable, metadata)).orElse(Some(field))
- selectField(child, childField)
- // Handles case "expr0.field[key]", where "expr0.field" is of map type.
+ case a: Attribute =>
+ dataTypeOpt.map { dt =>
+ StructField(a.name, dt, a.nullable)
+ }
+ case c: GetStructField =>
+ val field = c.childSchema(c.ordinal)
+ val newField = field.copy(dataType = dataTypeOpt.getOrElse(field.dataType))
+ selectField(c.child, Option(struct(newField)))
+ case GetArrayStructFields(child, field, _, _, containsNull) =>
+ val newFieldDataType = dataTypeOpt match {
+ case None =>
+ // GetArrayStructFields is the top-level extractor. This means its result is
+ // not pruned and we need to use the element type of the array it is producing.
+ field.dataType
+ case Some(ArrayType(dataType, _)) =>
+ // GetArrayStructFields is part of a chain of extractors and its result is pruned
+ // by a parent expression. In this case we need to use the parent element type.
+ dataType
+ case Some(x) =>
+ // This should not happen.
+ throw new AnalysisException(s"DataType '$x' is not supported by GetArrayStructFields.")
+ }
+ val newField = StructField(field.name, newFieldDataType, field.nullable)
+ selectField(child, Option(ArrayType(struct(newField), containsNull))) case GetMapValue(child, _) => - selectField(child, fieldOpt) - // Handles case "expr0.field", where expr0 is of struct type. 
- case GetStructFieldObject(child, - field @ StructField(name, dataType, nullable, metadata)) => - val childField = fieldOpt.map(field => StructField(name, - wrapStructType(dataType, field), - nullable, metadata)).orElse(Some(field)) - selectField(child, childField) + // GetMapValue does not select a field from a struct (i.e. prune the struct) so it can't be + // the top-level extractor. However it can be part of an extractor chain. + val MapType(keyType, _, valueContainsNull) = child.dataType + val opt = dataTypeOpt.map(dt => MapType(keyType, dt, valueContainsNull)) + selectField(child, opt) + case GetArrayItem(child, _) => + // GetArrayItem does not select a field from a struct (i.e. prune the struct) so it can't be + // the top-level extractor. However it can be part of an extractor chain. + val ArrayType(_, containsNull) = child.dataType + val opt = dataTypeOpt.map(dt => ArrayType(dt, containsNull)) + selectField(child, opt) case _ => None } } - // Constructs a composition of complex types with a StructType(Array(field)) at its core. Returns - // a StructType for a StructType, an ArrayType for an ArrayType and a MapType for a MapType. - private def wrapStructType(dataType: DataType, field: StructField): DataType = { - dataType match { - case _: StructType => - StructType(Array(field)) - case ArrayType(elementType, containsNull) => - ArrayType(wrapStructType(elementType, field), containsNull) - case MapType(keyType, valueType, valueContainsNull) => - MapType(keyType, wrapStructType(valueType, field), valueContainsNull) - } - } + private def struct(field: StructField): StructType = StructType(Array(field)) } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org