[ https://issues.apache.org/jira/browse/SPARK-38507?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17504703#comment-17504703 ]
qian commented on SPARK-38507: ------------------------------ Hi [~amavrommatis] This problem occurs because you aliased the dataframe *df* as *df*, resulting in a schema conflict. You can try this command: {code:scala} df.withColumn("field3", lit(0)).select("field3").show(2) {code} The following command also runs without error, but its result is not correct: {code:scala} df.withColumn("df.field2", lit(0)).select("df.field2").show(2) {code} The result is the original column *field2*, not your new column *df.field2*, whose value would be 0. > DataFrame withColumn method not adding or replacing columns when alias is used > ------------------------------------------------------------------------------ > > Key: SPARK-38507 > URL: https://issues.apache.org/jira/browse/SPARK-38507 > Project: Spark > Issue Type: Bug > Components: SQL > Affects Versions: 3.1.2 > Reporter: Alexandros Mavrommatis > Priority: Major > Labels: SQL, catalyst > > I have an input DataFrame *df* created as follows: > {code:java} > import spark.implicits._ > val df = List((5, 10), (6, 20)).toDF("field1", "field2").alias("df") {code} > When I execute either this command: > {code:java} > df.select("df.field2").show(2) {code} > or that one: > {code:java} > df.withColumn("df.field2", lit(0)).select("df.field2").show(2) {code} > I get the same result: > {code:java} > +------+ > |field2| > +------+ > | 10| > | 20| > +------+ {code} > Additionally, when I execute the following command: > {code:java} > df.withColumn("df.field3", lit(0)).select("df.field3").show(2){code} > I get this exception: > {code:java} > org.apache.spark.sql.AnalysisException: cannot resolve '`df.field3`' given > input columns: [df.field3, df.field1, df.field2]; 'Project ['df.field3] +- > Project [field1#7, field2#8, 0 AS df.field3#31] +- SubqueryAlias df > +- Project [_1#2 AS field1#7, _2#3 AS field2#8] +- LocalRelation > [_1#2, _2#3] at > org.apache.spark.sql.catalyst.analysis.package$AnalysisErrorAt.failAnalysis(package.scala:42) > at > 
org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$$nestedInanonfun$checkAnalysis$1$2.applyOrElse(CheckAnalysis.scala:155) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis$$anonfun$$nestedInanonfun$checkAnalysis$1$2.applyOrElse(CheckAnalysis.scala:152) > at > org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformUp$2(TreeNode.scala:342) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74) > at > org.apache.spark.sql.catalyst.trees.TreeNode.transformUp(TreeNode.scala:342) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$transformExpressionsUp$1(QueryPlan.scala:104) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$1(QueryPlan.scala:116) > at > org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:74) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpression$1(QueryPlan.scala:116) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.recursiveTransform$1(QueryPlan.scala:127) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$3(QueryPlan.scala:132) > at > scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:238) > at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62) > at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55) > at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49) at > scala.collection.TraversableLike.map(TraversableLike.scala:238) at > scala.collection.TraversableLike.map$(TraversableLike.scala:231) at > scala.collection.AbstractTraversable.map(Traversable.scala:108) at > org.apache.spark.sql.catalyst.plans.QueryPlan.recursiveTransform$1(QueryPlan.scala:132) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.$anonfun$mapExpressions$4(QueryPlan.scala:137) > at > org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:244) > at > 
org.apache.spark.sql.catalyst.plans.QueryPlan.mapExpressions(QueryPlan.scala:137) > at > org.apache.spark.sql.catalyst.plans.QueryPlan.transformExpressionsUp(QueryPlan.scala:104) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1(CheckAnalysis.scala:152) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.$anonfun$checkAnalysis$1$adapted(CheckAnalysis.scala:93) > at > org.apache.spark.sql.catalyst.trees.TreeNode.foreachUp(TreeNode.scala:184) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis(CheckAnalysis.scala:93) > at > org.apache.spark.sql.catalyst.analysis.CheckAnalysis.checkAnalysis$(CheckAnalysis.scala:90) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.checkAnalysis(Analyzer.scala:155) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.$anonfun$executeAndCheck$1(Analyzer.scala:176) > at > org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper$.markInAnalyzer(AnalysisHelper.scala:228) > at > org.apache.spark.sql.catalyst.analysis.Analyzer.executeAndCheck(Analyzer.scala:173) > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$analyzed$1(QueryExecution.scala:73) > at > org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111) > at > org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:143) > at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775) > at > org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:143) > at > org.apache.spark.sql.execution.QueryExecution.analyzed$lzycompute(QueryExecution.scala:73) > at > org.apache.spark.sql.execution.QueryExecution.analyzed(QueryExecution.scala:71) > at > org.apache.spark.sql.execution.QueryExecution.assertAnalyzed(QueryExecution.scala:63) > at org.apache.spark.sql.Dataset$.$anonfun$ofRows$1(Dataset.scala:90) at > org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775) at > 
org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:88) at > org.apache.spark.sql.Dataset.withPlan(Dataset.scala:3715) at > org.apache.spark.sql.Dataset.select(Dataset.scala:1462) at > org.apache.spark.sql.Dataset.select(Dataset.scala:1479) > ... 49 elided {code} -- This message was sent by Atlassian Jira (v8.20.1#820001) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org