bluzy opened a new issue, #7065:
URL: https://github.com/apache/iceberg/issues/7065

   ### Query engine
   
   Spark
   
   ### Question
   
   I have a table with some nested fields and tried to delete rows via Spark SQL.
   But the operation failed with an error like:
   `java.lang.NullPointerException: Cannot filter by nested column`
   
   
   To simplify reproduction, I wrote the following Scala test:
   ```scala
   import org.apache.spark.sql.{Encoders, SparkSession}
   import org.scalatest.funsuite.AnyFunSuite
   import org.scalatest.matchers.should.Matchers

   class NestedDeleteTest extends AnyFunSuite with Matchers {

     test("delete with nested field") {
       val catalogName = "hadoop"
       val dbName      = "test_db"
       val tableName   = "test_table"
       val tableId     = s"$catalogName.$dbName.$tableName"

       val warehouse = s"${System.getProperty("user.dir")}/warehouse"

       // Local session with the Iceberg extensions and a Hadoop catalog.
       val spark = SparkSession.builder
         .master("local[1]")
         .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
         .config(s"spark.sql.catalog.$catalogName", "org.apache.iceberg.spark.SparkCatalog")
         .config(s"spark.sql.catalog.$catalogName.type", "hadoop")
         .config(s"spark.sql.catalog.$catalogName.warehouse", warehouse)
         .getOrCreate()

       // Three rows with a nested struct column and a top-level long column.
       val rows = List(
         """{"nested":{"id":1},"timestamp":1}""",
         """{"nested":{"id":2},"timestamp":1}""",
         """{"nested":{"id":1},"timestamp":2}"""
       )

       val df = spark.read.json(spark.createDataset(spark.sparkContext.parallelize(rows))(Encoders.STRING))

       df.writeTo(tableId).createOrReplace()

       val before = spark.table(tableId)
       before.show()
       before.count() shouldBe 3

       // Deleting by a nested field fails with the NullPointerException below.
       val sql = s"DELETE FROM $tableId WHERE nested.id=1 AND timestamp=2"
       spark.sql(sql)

       val after = spark.table(tableId)
       after.show()
       after.count() shouldBe 2
     }
   }
   }
   ```
   
   Running this test reproduces the problem.
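
   The same failure should also be reproducible without the test harness; here is a sketch in plain Spark SQL (assuming the same session and `hadoop` catalog configuration as above, with DDL that mirrors the schema inferred from the JSON rows):

   ```scala
   // Sketch: same reproduction with SQL DDL/DML instead of the JSON-based write.
   spark.sql(s"CREATE TABLE $tableId (nested STRUCT<id: BIGINT>, timestamp BIGINT) USING iceberg")
   spark.sql(s"INSERT INTO $tableId VALUES (named_struct('id', 1L), 1L), (named_struct('id', 2L), 1L), (named_struct('id', 1L), 2L)")

   // This statement fails with the NullPointerException below.
   spark.sql(s"DELETE FROM $tableId WHERE nested.id = 1 AND timestamp = 2")
   ```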
   
   stacktrace:
   ```
    java.lang.NullPointerException: Cannot filter by nested column: 5: id: optional long
        at org.apache.iceberg.relocated.com.google.common.base.Preconditions.checkNotNull(Preconditions.java:994)
        at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.eq(StrictMetricsEvaluator.java:305)
        at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.eq(StrictMetricsEvaluator.java:83)
        at org.apache.iceberg.expressions.ExpressionVisitors$BoundExpressionVisitor.predicate(ExpressionVisitors.java:162)
        at org.apache.iceberg.expressions.ExpressionVisitors.visitEvaluator(ExpressionVisitors.java:390)
        at org.apache.iceberg.expressions.ExpressionVisitors.visitEvaluator(ExpressionVisitors.java:405)
        at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.eval(StrictMetricsEvaluator.java:101)
        at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.access$100(StrictMetricsEvaluator.java:83)
        at org.apache.iceberg.expressions.StrictMetricsEvaluator.eval(StrictMetricsEvaluator.java:77)
        at org.apache.iceberg.spark.source.SparkTable.lambda$canDeleteUsingMetadata$2(SparkTable.java:300)
        at org.apache.iceberg.relocated.com.google.common.collect.Iterators.all(Iterators.java:710)
        at org.apache.iceberg.relocated.com.google.common.collect.Iterables.all(Iterables.java:645)
        at org.apache.iceberg.spark.source.SparkTable.canDeleteUsingMetadata(SparkTable.java:289)
        at org.apache.iceberg.spark.source.SparkTable.canDeleteWhere(SparkTable.java:269)
        at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$$anonfun$apply$1.applyOrElse(OptimizeMetadataOnlyDeleteFromTable.scala:53)
        at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$$anonfun$apply$1.applyOrElse(OptimizeMetadataOnlyDeleteFromTable.scala:44)
        at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
        at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:425)
        at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$.apply(OptimizeMetadataOnlyDeleteFromTable.scala:44)
        at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$.apply(OptimizeMetadataOnlyDeleteFromTable.scala:40)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:211)
        at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
        at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
        at scala.collection.immutable.List.foldLeft(List.scala:91)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:208)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:200)
        at scala.collection.immutable.List.foreach(List.scala:431)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:200)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:179)
        at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
        at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:179)
        at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:125)
        at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
        at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:183)
        at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
        at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:183)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:121)
        at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:117)
        at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:135)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:153)
        at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:150)
        at org.apache.spark.sql.execution.QueryExecution.simpleString(QueryExecution.scala:201)
        at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$explainString(QueryExecution.scala:246)
        at org.apache.spark.sql.execution.QueryExecution.explainString(QueryExecution.scala:215)
        at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:98)
        at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
        at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
        at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
        at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
        at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:97)
        at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:93)
        at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
        at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
        at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
        at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
        at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:93)
        at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:80)
        at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:78)
        at org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)
        at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
        at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
        at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
        at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:618)
        at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
        at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
   ```
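
   From the stack trace, the NPE seems to come from the `StrictMetricsEvaluator` that `SparkTable.canDeleteUsingMetadata` runs when `OptimizeMetadataOnlyDeleteFromTable` checks whether the DELETE can be applied on metadata alone. Below is a minimal sketch (untested, and the field ids are assumptions chosen to mirror the table above) that I would expect to hit the same check directly, without Spark:

   ```scala
   import org.apache.iceberg.{DataFiles, FileFormat, PartitionSpec, Schema}
   import org.apache.iceberg.expressions.{Expressions, StrictMetricsEvaluator}
   import org.apache.iceberg.types.Types

   // Hypothetical schema mirroring the table above; field id 5 is chosen to
   // match the "5: id: optional long" in the error message.
   val schema = new Schema(
     Types.NestedField.optional(1, "nested", Types.StructType.of(
       Types.NestedField.optional(5, "id", Types.LongType.get()))),
     Types.NestedField.optional(2, "timestamp", Types.LongType.get()))

   // Any non-empty data file should do; the evaluator trips over the nested
   // reference before it looks at the file's metrics.
   val file = DataFiles.builder(PartitionSpec.unpartitioned())
     .withPath("/tmp/placeholder.parquet")
     .withFormat(FileFormat.PARQUET)
     .withFileSizeInBytes(10L)
     .withRecordCount(1L)
     .build()

   // Expected to throw:
   // java.lang.NullPointerException: Cannot filter by nested column: 5: id: optional long
   new StrictMetricsEvaluator(schema, Expressions.equal("nested.id", 1L)).eval(file)
   ```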
   
   Is this expected behavior?
   I am confused because I found a test case for deleting with a nested column:

https://github.com/apache/iceberg/blob/39a2c12b843f6f16e3389e738531e24a32f5bf39/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java#L383-L397

