bluzy opened a new issue, #7065:
URL: https://github.com/apache/iceberg/issues/7065
### Query engine
Spark
### Question
I have a table with some nested fields and tried to delete rows with Spark SQL.
But the operation failed with an error like:
`java.lang.NullPointerException: Cannot filter by nested column`
To simplify, I wrote a Scala test:
```scala
import org.apache.spark.sql.{Encoders, SparkSession}
import org.scalatest.funsuite.AnyFunSuite
import org.scalatest.matchers.should.Matchers

class NestedDeleteTest extends AnyFunSuite with Matchers {
  test("delete with nested field") {
    val catalogName = "hadoop"
    val dbName = "test_db"
    val tableName = "test_table"
    val tableId = s"$catalogName.$dbName.$tableName"
    val warehouse = s"${System.getProperty("user.dir")}/warehouse"

    val spark = SparkSession.builder
      .master("local[1]")
      .config("spark.sql.extensions",
        "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
      .config(s"spark.sql.catalog.$catalogName", "org.apache.iceberg.spark.SparkCatalog")
      .config(s"spark.sql.catalog.$catalogName.type", "hadoop")
      .config(s"spark.sql.catalog.$catalogName.warehouse", warehouse)
      .getOrCreate()

    val rows = List(
      """{"nested":{"id":1},"timestamp":1}""",
      """{"nested":{"id":2},"timestamp":1}""",
      """{"nested":{"id":1},"timestamp":2}"""
    )
    val df = spark.read.json(
      spark.createDataset(spark.sparkContext.parallelize(rows))(Encoders.STRING))
    df.writeTo(tableId).createOrReplace()

    val before = spark.table(tableId)
    before.show()
    before.count() shouldBe 3

    val sql = s"DELETE FROM $tableId WHERE nested.id=1 AND timestamp=2"
    spark.sql(sql)

    val after = spark.table(tableId)
    after.show()
    after.count() shouldBe 2
  }
}
```
This reproduces the problem.
Stacktrace:
```
java.lang.NullPointerException: Cannot filter by nested column: 5: id: optional long
    at org.apache.iceberg.relocated.com.google.common.base.Preconditions.checkNotNull(Preconditions.java:994)
    at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.eq(StrictMetricsEvaluator.java:305)
    at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.eq(StrictMetricsEvaluator.java:83)
    at org.apache.iceberg.expressions.ExpressionVisitors$BoundExpressionVisitor.predicate(ExpressionVisitors.java:162)
    at org.apache.iceberg.expressions.ExpressionVisitors.visitEvaluator(ExpressionVisitors.java:390)
    at org.apache.iceberg.expressions.ExpressionVisitors.visitEvaluator(ExpressionVisitors.java:405)
    at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.eval(StrictMetricsEvaluator.java:101)
    at org.apache.iceberg.expressions.StrictMetricsEvaluator$MetricsEvalVisitor.access$100(StrictMetricsEvaluator.java:83)
    at org.apache.iceberg.expressions.StrictMetricsEvaluator.eval(StrictMetricsEvaluator.java:77)
    at org.apache.iceberg.spark.source.SparkTable.lambda$canDeleteUsingMetadata$2(SparkTable.java:300)
    at org.apache.iceberg.relocated.com.google.common.collect.Iterators.all(Iterators.java:710)
    at org.apache.iceberg.relocated.com.google.common.collect.Iterables.all(Iterables.java:645)
    at org.apache.iceberg.spark.source.SparkTable.canDeleteUsingMetadata(SparkTable.java:289)
    at org.apache.iceberg.spark.source.SparkTable.canDeleteWhere(SparkTable.java:269)
    at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$$anonfun$apply$1.applyOrElse(OptimizeMetadataOnlyDeleteFromTable.scala:53)
    at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$$anonfun$apply$1.applyOrElse(OptimizeMetadataOnlyDeleteFromTable.scala:44)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transform(TreeNode.scala:425)
    at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$.apply(OptimizeMetadataOnlyDeleteFromTable.scala:44)
    at org.apache.spark.sql.execution.datasources.v2.OptimizeMetadataOnlyDeleteFromTable$.apply(OptimizeMetadataOnlyDeleteFromTable.scala:40)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$2(RuleExecutor.scala:211)
    at scala.collection.LinearSeqOptimized.foldLeft(LinearSeqOptimized.scala:126)
    at scala.collection.LinearSeqOptimized.foldLeft$(LinearSeqOptimized.scala:122)
    at scala.collection.immutable.List.foldLeft(List.scala:91)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1(RuleExecutor.scala:208)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$execute$1$adapted(RuleExecutor.scala:200)
    at scala.collection.immutable.List.foreach(List.scala:431)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.execute(RuleExecutor.scala:200)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.$anonfun$executeAndTrack$1(RuleExecutor.scala:179)
    at org.apache.spark.sql.catalyst.QueryPlanningTracker$.withTracker(QueryPlanningTracker.scala:88)
    at org.apache.spark.sql.catalyst.rules.RuleExecutor.executeAndTrack(RuleExecutor.scala:179)
    at org.apache.spark.sql.execution.QueryExecution.$anonfun$optimizedPlan$1(QueryExecution.scala:125)
    at org.apache.spark.sql.catalyst.QueryPlanningTracker.measurePhase(QueryPlanningTracker.scala:111)
    at org.apache.spark.sql.execution.QueryExecution.$anonfun$executePhase$1(QueryExecution.scala:183)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.QueryExecution.executePhase(QueryExecution.scala:183)
    at org.apache.spark.sql.execution.QueryExecution.optimizedPlan$lzycompute(QueryExecution.scala:121)
    at org.apache.spark.sql.execution.QueryExecution.optimizedPlan(QueryExecution.scala:117)
    at org.apache.spark.sql.execution.QueryExecution.assertOptimized(QueryExecution.scala:135)
    at org.apache.spark.sql.execution.QueryExecution.executedPlan$lzycompute(QueryExecution.scala:153)
    at org.apache.spark.sql.execution.QueryExecution.executedPlan(QueryExecution.scala:150)
    at org.apache.spark.sql.execution.QueryExecution.simpleString(QueryExecution.scala:201)
    at org.apache.spark.sql.execution.QueryExecution.org$apache$spark$sql$execution$QueryExecution$$explainString(QueryExecution.scala:246)
    at org.apache.spark.sql.execution.QueryExecution.explainString(QueryExecution.scala:215)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$5(SQLExecution.scala:98)
    at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:163)
    at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:90)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:64)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:97)
    at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:93)
    at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:82)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:481)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
    at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:30)
    at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:457)
    at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:93)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:80)
    at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:78)
    at org.apache.spark.sql.Dataset.<init>(Dataset.scala:219)
    at org.apache.spark.sql.Dataset$.$anonfun$ofRows$2(Dataset.scala:99)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.Dataset$.ofRows(Dataset.scala:96)
    at org.apache.spark.sql.SparkSession.$anonfun$sql$1(SparkSession.scala:618)
    at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:775)
    at org.apache.spark.sql.SparkSession.sql(SparkSession.scala:613)
```
Is this expected behavior?
I am confused, because I found a test case for deleting with a nested column:
https://github.com/apache/iceberg/blob/39a2c12b843f6f16e3389e738531e24a32f5bf39/spark/v3.1/spark-extensions/src/test/java/org/apache/iceberg/spark/extensions/TestDelete.java#L383-L397
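
For what it's worth, here is a minimal sketch, outside of Spark, of where the NPE seems to come from, assuming the Iceberg APIs shown here (`Schema`, `Expressions.equal`, `StrictMetricsEvaluator`, `DataFiles.builder`). `StrictMetricsEvaluator` only looks up top-level fields of the schema struct, so a predicate bound to a nested field appears to trip the `checkNotNull` in the stacktrace above. The field ids, file path, and sizes below are made up for illustration and are not from my table.

```scala
import org.apache.iceberg.{DataFiles, PartitionSpec, Schema}
import org.apache.iceberg.expressions.{Expressions, StrictMetricsEvaluator}
import org.apache.iceberg.types.Types

object NestedMetricsSketch extends App {
  // Schema shaped like the table above; the field ids (1, 2, 5) are illustrative.
  val schema = new Schema(
    Types.NestedField.optional(1, "nested", Types.StructType.of(
      Types.NestedField.optional(5, "id", Types.LongType.get()))),
    Types.NestedField.optional(2, "timestamp", Types.LongType.get()))

  // A dummy data file with no column metrics; path and sizes are placeholders.
  val file = DataFiles.builder(PartitionSpec.unpartitioned())
    .withPath("/tmp/warehouse/data/00000-0.parquet")
    .withFileSizeInBytes(10L)
    .withRecordCount(1L)
    .build()

  // The predicate binds nested.id to field id 5, but the evaluator's eq() only
  // checks struct.field(id) on the top-level struct, so this eval is expected to
  // throw "Cannot filter by nested column: 5: id: optional long".
  val evaluator = new StrictMetricsEvaluator(schema, Expressions.equal("nested.id", 1L))
  evaluator.eval(file)
}
```

If I read the stacktrace correctly, `OptimizeMetadataOnlyDeleteFromTable` runs this evaluator via `SparkTable.canDeleteWhere` while the DELETE is still being planned, so the query fails even in cases where the metadata-only path would simply not have applied.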