maropu commented on a change in pull request #31769: URL: https://github.com/apache/spark/pull/31769#discussion_r589834290
########## File path: sql/core/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala ########## @@ -352,10 +352,21 @@ final class DataFrameNaFunctions private[sql](df: DataFrame) { * * @since 1.3.1 */ - def replace[T](cols: Seq[String], replacement: Map[T, T]): DataFrame = replace0(cols, replacement) + def replace[T](cols: Seq[String], replacement: Map[T, T]): DataFrame = { + val attrs = cols.map { colName => + // Check column name exists + val attr = df.resolve(colName) match { + case a: Attribute => a + case _ => throw new UnsupportedOperationException( Review comment: `AnalysisException` instead? How about following the behaviour of the other functions in this class? e.g., ``` scala> df.show() +----+---------+ | a| b| +----+---------+ |null|{1, null}| +----+---------+ scala> df.printSchema() root |-- a: integer (nullable = true) |-- b: struct (nullable = false) | |-- c0: integer (nullable = false) | |-- c1: integer (nullable = true) scala> df.na.fill(3, Seq("a")).show() +---+---------+ | a| b| +---+---------+ | 3|{1, null}| +---+---------+ scala> df.na.fill(3, Seq("b")).show() +----+---------+ | a| b| +----+---------+ |null|{1, null}| +----+---------+ scala> df.na.fill(3, Seq("b.c2")).show() org.apache.spark.sql.AnalysisException: No such struct field c2 in c0, c1 at org.apache.spark.sql.catalyst.expressions.ExtractValue$.findField(complexTypeExtractors.scala:82) at org.apache.spark.sql.catalyst.expressions.ExtractValue$.apply(complexTypeExtractors.scala:55) at org.apache.spark.sql.catalyst.expressions.package$AttributeSeq.$anonfun$resolve$1(package.scala:348) at scala.collection.IndexedSeqOptimized.foldLeft(IndexedSeqOptimized.scala:60) at scala.collection.IndexedSeqOptimized.foldLeft$(IndexedSeqOptimized.scala:68) at scala.collection.mutable.ArrayBuffer.foldLeft(ArrayBuffer.scala:49) at org.apache.spark.sql.catalyst.expressions.package$AttributeSeq.resolve(package.scala:347) at 
org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolve(LogicalPlan.scala:119) at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.resolveQuoted(LogicalPlan.scala:130) at org.apache.spark.sql.Dataset.resolve(Dataset.scala:262) at org.apache.spark.sql.Dataset.col(Dataset.scala:1361) at org.apache.spark.sql.DataFrameNaFunctions.$anonfun$toAttributes$1(DataFrameNaFunctions.scala:475) at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238) at scala.collection.immutable.List.foreach(List.scala:392) at scala.collection.TraversableLike.map(TraversableLike.scala:238) at scala.collection.TraversableLike.map$(TraversableLike.scala:231) at scala.collection.immutable.List.map(List.scala:298) at org.apache.spark.sql.DataFrameNaFunctions.toAttributes(DataFrameNaFunctions.scala:475) at org.apache.spark.sql.DataFrameNaFunctions.fill(DataFrameNaFunctions.scala:163) ... 47 elided scala> df.na.fill(3, Seq("c")).show() org.apache.spark.sql.AnalysisException: Cannot resolve column name "c" among (a, b) at org.apache.spark.sql.Dataset.org$apache$spark$sql$Dataset$$resolveException(Dataset.scala:272) at org.apache.spark.sql.Dataset.$anonfun$resolve$1(Dataset.scala:263) at scala.Option.getOrElse(Option.scala:189) at org.apache.spark.sql.Dataset.resolve(Dataset.scala:263) at org.apache.spark.sql.Dataset.col(Dataset.scala:1361) at org.apache.spark.sql.DataFrameNaFunctions.$anonfun$toAttributes$1(DataFrameNaFunctions.scala:475) at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238) at scala.collection.immutable.List.foreach(List.scala:392) at scala.collection.TraversableLike.map(TraversableLike.scala:238) at scala.collection.TraversableLike.map$(TraversableLike.scala:231) at scala.collection.immutable.List.map(List.scala:298) at org.apache.spark.sql.DataFrameNaFunctions.toAttributes(DataFrameNaFunctions.scala:475) at org.apache.spark.sql.DataFrameNaFunctions.fill(DataFrameNaFunctions.scala:163) ... 
47 elided ``` ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org