LuciferYang commented on code in PR #45290: URL: https://github.com/apache/spark/pull/45290#discussion_r1517243024
########## sql/core/src/test/scala/org/apache/spark/sql/CollationSuite.scala: ########## @@ -183,6 +185,57 @@ class CollationSuite extends DatasourceV2SQLBase { } } + test("aggregates count respects collation") { + Seq( + ("ucs_basic", Seq("AAA", "aaa"), Seq(Row(1, "AAA"), Row(1, "aaa"))), Review Comment: This test case failed in the daily test of Maven + Java 21, both on Linux and MacOS. @dbatomic Do you have time to investigate the cause of this failure? - linux: https://github.com/apache/spark/actions/runs/8189363924/job/22416511124 ``` - aggregates count respects collation *** FAILED *** Exception thrown while executing query: == Parsed Logical Plan == CTE [t] : +- 'SubqueryAlias t : +- 'Project ['collate('col1, unicode_CI) AS c#427560] : +- 'UnresolvedInlineTable [col1], [[AAA], [aaa]] +- 'Aggregate ['c], [unresolvedalias('COUNT(1)), 'c] +- 'UnresolvedRelation [t], [], false == Analyzed Logical Plan == count(1): bigint, c: string COLLATE 'UNICODE_CI' WithCTE :- CTERelationDef 106, false : +- SubqueryAlias t : +- Project [collate(col1#427562, unicode_CI) AS c#427560] : +- LocalRelation [col1#427562] +- Aggregate [c#427560], [count(1) AS count(1)#427563L, c#427560] +- SubqueryAlias t +- CTERelationRef 106, true, [c#427560], false == Optimized Logical Plan == Aggregate [c#427560], [count(1) AS count(1)#427563L, c#427560] +- LocalRelation [c#427560] == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- == Current Plan == SortAggregate(key=[c#427560], functions=[count(1)], output=[count(1)#427563L, c#427560]) +- Sort [c#427560 ASC NULLS FIRST], false, 0 +- ShuffleQueryStage 0 +- Exchange hashpartitioning(c#427560, 5), ENSURE_REQUIREMENTS, [plan_id=435997] +- SortAggregate(key=[c#427560], functions=[partial_count(1)], output=[c#427560, count#427567L]) +- *(1) Sort [c#427560 ASC NULLS FIRST], false, 0 +- *(1) LocalTableScan [c#427560] +- == Initial Plan == SortAggregate(key=[c#427560], functions=[count(1)], output=[count(1)#427563L, c#427560]) +- Sort [c#427560 
ASC NULLS FIRST], false, 0 +- Exchange hashpartitioning(c#427560, 5), ENSURE_REQUIREMENTS, [plan_id=435931] +- SortAggregate(key=[c#427560], functions=[partial_count(1)], output=[c#427560, count#427567L]) +- Sort [c#427560 ASC NULLS FIRST], false, 0 +- LocalTableScan [c#427560] == Exception == org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 393.0 failed 1 times, most recent failure: Lost task 1.0 in stage 393.0 (TID 394) (localhost executor driver): java.lang.StringIndexOutOfBoundsException: Index 3 out of bounds for length 3 at java.base/jdk.internal.util.Preconditions$1.apply(Preconditions.java:55) at java.base/jdk.internal.util.Preconditions$1.apply(Preconditions.java:52) at java.base/jdk.internal.util.Preconditions$4.apply(Preconditions.java:213) at java.base/jdk.internal.util.Preconditions$4.apply(Preconditions.java:210) at java.base/jdk.internal.util.Preconditions.outOfBounds(Preconditions.java:98) at java.base/jdk.internal.util.Preconditions.outOfBoundsCheckIndex(Preconditions.java:106) at java.base/jdk.internal.util.Preconditions.checkIndex(Preconditions.java:302) at java.base/java.lang.String.checkIndex(String.java:4832) at java.base/java.lang.StringLatin1.charAt(StringLatin1.java:46) at java.base/java.lang.String.charAt(String.java:1555) at com.ibm.icu.impl.coll.UTF16CollationIterator.handleNextCE32(UTF16CollationIterator.java:107) at com.ibm.icu.impl.coll.CollationIterator.nextCE(CollationIterator.java:247) at com.ibm.icu.impl.coll.CollationKeys.writeSortKeyUpToQuaternary(CollationKeys.java:374) at com.ibm.icu.text.RuleBasedCollator.writeSortKey(RuleBasedCollator.java:1159) at com.ibm.icu.text.RuleBasedCollator.getRawCollationKey(RuleBasedCollator.java:1146) at com.ibm.icu.text.RuleBasedCollator.getCollationKey(RuleBasedCollator.java:1071) at com.ibm.icu.text.RuleBasedCollator.getCollationKey(RuleBasedCollator.java:1064) at 
org.apache.spark.sql.catalyst.util.CollationFactory$Collation.lambda$new$2(CollationFactory.java:104) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$8(ShuffleExchangeExec.scala:330) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$8$adapted(ShuffleExchangeExec.scala:330) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$18(ShuffleExchangeExec.scala:401) at scala.collection.Iterator$$anon$9.next(Iterator.scala:584) at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:169) at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:56) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171) at org.apache.spark.scheduler.Task.run(Task.scala:146) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:632) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:635) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) at java.base/java.lang.Thread.run(Thread.java:1583) ``` - macos-14: https://github.com/apache/spark/actions/runs/8194082205/job/22416483724 ``` - aggregates count respects collation *** FAILED *** Exception thrown while executing query: == Parsed Logical Plan 
== CTE [t] : +- 'SubqueryAlias t : +- 'Project ['collate('col1, unicode_CI) AS c#427467] : +- 'UnresolvedInlineTable [col1], [[aaa], [aaa]] +- 'Aggregate ['c], [unresolvedalias('COUNT(1)), 'c] +- 'UnresolvedRelation [t], [], false == Analyzed Logical Plan == count(1): bigint, c: string COLLATE 'UNICODE_CI' WithCTE :- CTERelationDef 103, false : +- SubqueryAlias t : +- Project [collate(col1#427469, unicode_CI) AS c#427467] : +- LocalRelation [col1#427469] +- Aggregate [c#427467], [count(1) AS count(1)#427470L, c#427467] +- SubqueryAlias t +- CTERelationRef 103, true, [c#427467], false == Optimized Logical Plan == Aggregate [c#427467], [count(1) AS count(1)#427470L, c#427467] +- LocalRelation [c#427467] == Physical Plan == AdaptiveSparkPlan isFinalPlan=false +- == Current Plan == SortAggregate(key=[c#427467], functions=[count(1)], output=[count(1)#427470L, c#427467]) +- Sort [c#427467 ASC NULLS FIRST], false, 0 +- ShuffleQueryStage 0 +- Exchange hashpartitioning(c#427467, 5), ENSURE_REQUIREMENTS, [plan_id=435580] +- SortAggregate(key=[c#427467], functions=[partial_count(1)], output=[c#427467, count#427474L]) +- *(1) Sort [c#427467 ASC NULLS FIRST], false, 0 +- *(1) LocalTableScan [c#427467] +- == Initial Plan == SortAggregate(key=[c#427467], functions=[count(1)], output=[count(1)#427470L, c#427467]) +- Sort [c#427467 ASC NULLS FIRST], false, 0 +- Exchange hashpartitioning(c#427467, 5), ENSURE_REQUIREMENTS, [plan_id=435514] +- SortAggregate(key=[c#427467], functions=[partial_count(1)], output=[c#427467, count#427474L]) +- Sort [c#427467 ASC NULLS FIRST], false, 0 +- LocalTableScan [c#427467] == Exception == org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 387.0 failed 1 times, most recent failure: Lost task 0.0 in stage 387.0 (TID 387) (localhost executor driver): java.lang.StringIndexOutOfBoundsException: Index 4 out of bounds for length 3 at java.base/jdk.internal.util.Preconditions$1.apply(Preconditions.java:55) at 
java.base/jdk.internal.util.Preconditions$1.apply(Preconditions.java:52) at java.base/jdk.internal.util.Preconditions$4.apply(Preconditions.java:213) at java.base/jdk.internal.util.Preconditions$4.apply(Preconditions.java:210) at java.base/jdk.internal.util.Preconditions.outOfBounds(Preconditions.java:98) at java.base/jdk.internal.util.Preconditions.outOfBoundsCheckIndex(Preconditions.java:106) at java.base/jdk.internal.util.Preconditions.checkIndex(Preconditions.java:302) at java.base/java.lang.String.checkIndex(String.java:4832) at java.base/java.lang.StringLatin1.charAt(StringLatin1.java:46) at java.base/java.lang.String.charAt(String.java:1555) at com.ibm.icu.impl.coll.UTF16CollationIterator.handleNextCE32(UTF16CollationIterator.java:107) at com.ibm.icu.impl.coll.CollationIterator.nextCE(CollationIterator.java:247) at com.ibm.icu.impl.coll.CollationKeys.writeSortKeyUpToQuaternary(CollationKeys.java:374) at com.ibm.icu.text.RuleBasedCollator.writeSortKey(RuleBasedCollator.java:1159) at com.ibm.icu.text.RuleBasedCollator.getRawCollationKey(RuleBasedCollator.java:1146) at com.ibm.icu.text.RuleBasedCollator.getCollationKey(RuleBasedCollator.java:1071) at com.ibm.icu.text.RuleBasedCollator.getCollationKey(RuleBasedCollator.java:1064) at org.apache.spark.sql.catalyst.util.CollationFactory$Collation.lambda$new$2(CollationFactory.java:104) at org.apache.spark.sql.catalyst.expressions.GeneratedClass$SpecificUnsafeProjection.apply(Unknown Source) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$8(ShuffleExchangeExec.scala:330) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$8$adapted(ShuffleExchangeExec.scala:330) at org.apache.spark.sql.execution.exchange.ShuffleExchangeExec$.$anonfun$prepareShuffleDependency$18(ShuffleExchangeExec.scala:401) at scala.collection.Iterator$$anon$9.next(Iterator.scala:584) at 
org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:169) at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:56) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:111) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54) at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:171) at org.apache.spark.scheduler.Task.run(Task.scala:146) at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$5(Executor.scala:632) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64) at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61) at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:97) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:635) at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144) at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642) at java.base/java.lang.Thread.run(Thread.java:1583) ``` Also cc @HyukjinKwon, because I noticed that you have been following the Maven + Java 21 tests on macos-14. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscribe@spark.apache.org For additional commands, e-mail: reviews-help@spark.apache.org