Onur Satici created SPARK-23274: ----------------------------------- Summary: ReplaceExceptWithFilter fails on dataframes filtered on same column Key: SPARK-23274 URL: https://issues.apache.org/jira/browse/SPARK-23274 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 2.3.0 Reporter: Onur Satici
Steps to reproduce: {code:java} $ cat test.csv a,b 1,2 1,3 2,2 2,4 {code} {code:java} val df = spark.read.format("csv").option("header", "true").load("test.csv") val df1 = df.filter($"a" === 1) val df2 = df.filter($"a" === 2) df1.select("b").except(df2.select("b")).show {code} results in: {code:java} java.util.NoSuchElementException: key not found: a at scala.collection.MapLike$class.default(MapLike.scala:228) at scala.collection.AbstractMap.default(Map.scala:59) at scala.collection.MapLike$class.apply(MapLike.scala:141) at scala.collection.AbstractMap.apply(Map.scala:59) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$$anonfun$org$apache$spark$sql$catalyst$optimizer$ReplaceExceptWithFilter$$transformCondition$1.applyOrElse(ReplaceExceptWithFilter.scala:60) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$$anonfun$org$apache$spark$sql$catalyst$optimizer$ReplaceExceptWithFilter$$transformCondition$1.applyOrElse(ReplaceExceptWithFilter.scala:60) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306) at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187) at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306) at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187) at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$.org$apache$spark$sql$catalyst$optimizer$ReplaceExceptWithFilter$$transformCondition(ReplaceExceptWithFilter.scala:60) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$$anonfun$apply$1.applyOrElse(ReplaceExceptWithFilter.scala:50) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$$anonfun$apply$1.applyOrElse(ReplaceExceptWithFilter.scala:48) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$2.apply(TreeNode.scala:267) at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:70) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:266) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306) at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187) at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$transformDown$1.apply(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode$$anonfun$4.apply(TreeNode.scala:306) at org.apache.spark.sql.catalyst.trees.TreeNode.mapProductIterator(TreeNode.scala:187) at org.apache.spark.sql.catalyst.trees.TreeNode.mapChildren(TreeNode.scala:304) at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:272) at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:256) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$.apply(ReplaceExceptWithFilter.scala:48) at org.apache.spark.sql.catalyst.optimizer.ReplaceExceptWithFilter$.apply(ReplaceExceptWithFilter.scala:41) at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:92) at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1$$anonfun$apply$1.apply(RuleExecutor.scala:89) at scala.collection.IndexedSeqOptimized$class.foldl(IndexedSeqOptimized.scala:57) at scala.collection.IndexedSeqOptimized$class.foldLeft(IndexedSeqOptimized.scala:66) at scala.collection.mutable.WrappedArray.foldLeft(WrappedArray.scala:35) at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:89) at org.apache.spark.sql.catalyst.rules.RuleExecutor$$anonfun$execute$1.apply(RuleExecutor.scala:81) at scala.collection.immutable.List.foreach(List.scala:381) at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:81) at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:79) at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:79) at org.apache.spark.sql.execution.QueryExecution.sparkPlan$lzycompute(QueryExecution.scala:85) at org.apache.spark.sql.execution.QueryExecution.sparkPlan(QueryExecution.scala:81) at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:90) at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:90) at org.apache.spark.sql.Dataset.withAction(Dataset.scala:3141) at org.apache.spark.sql.Dataset.head(Dataset.scala:2426) at org.apache.spark.sql.Dataset.take(Dataset.scala:2640) at org.apache.spark.sql.Dataset.showString(Dataset.scala:237) at org.apache.spark.sql.Dataset.show(Dataset.scala:668) at org.apache.spark.sql.Dataset.show(Dataset.scala:627) at org.apache.spark.sql.Dataset.show(Dataset.scala:636) ... 49 elided {code} -- This message was sent by Atlassian JIRA (v7.6.3#76005) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org