770120041 commented on code in PR #15555:
URL: https://github.com/apache/iceberg/pull/15555#discussion_r2905843142
##########
spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestStoragePartitionedJoins.java:
##########
@@ -168,6 +168,75 @@ public void testJoinsWithBucketingOnDecimalColumn() throws NoSuchTableException
checkJoin("decimal_col", "DECIMAL(20, 2)", "bucket(8, decimal_col)");
}
+ @TestTemplate
+ public void testJoinsWithBucketingOnStringColumn() throws NoSuchTableException {
+ checkJoin("string_col", "STRING", "bucket(8, string_col)");
+ }
+
+ @TestTemplate
+ public void testJoinsWithIdentityAndBucketOnStringColumn() throws NoSuchTableException {
+ // Regression test for GitHub issue #15349: SPJ with bucket partition key on String column
+ // Bucket transform produces Integer, but source column is String
+ String createTableStmt =
+ "CREATE TABLE %s (id BIGINT, dep STRING, user_id STRING)"
+ + "USING iceberg "
+ + "PARTITIONED BY (dep, bucket(8, user_id))"
+ + "TBLPROPERTIES (%s)";
+
+ sql(createTableStmt, tableName, tablePropsAsString(TABLE_PROPERTIES));
+ sql(createTableStmt, tableName(OTHER_TABLE_NAME), tablePropsAsString(TABLE_PROPERTIES));
+
+ sql(
+ "INSERT INTO %s VALUES (1, 'software', 'user1'), (2, 'hr', 'user2'), (3, 'software', 'user3')",
+ tableName);
+ sql(
+ "INSERT INTO %s VALUES (1, 'software', 'user1'), (2, 'hr', 'user2'), (4, 'software', 'user4')",
+ tableName(OTHER_TABLE_NAME));
+
+ assertPartitioningAwarePlan(
+ 1, /* expected num of shuffles with SPJ */
+ 3, /* expected num of shuffles without SPJ */
+ "SELECT t1.id, t1.dep, t1.user_id "
+ + "FROM %s t1 "
+ + "INNER JOIN %s t2 "
+ + "ON t1.id = t2.id AND t1.dep = t2.dep AND t1.user_id = t2.user_id "
+ + "ORDER BY t1.id, t1.dep, t1.user_id",
+ tableName,
+ tableName(OTHER_TABLE_NAME));
+ }
+
+ @TestTemplate
+ public void testJoinsWithBucketOnStringAndIdentityColumns() throws NoSuchTableException {
Review Comment:
Good points @huaxingao! Removed the redundant
`testJoinsWithBucketOnStringAndIdentityColumns` from v3.5, and added
`testJoinsWithBucketingOnStringColumn` and
`testJoinsWithIdentityAndBucketOnStringColumn` to v3.4 and v4.0 so all
four Spark versions have consistent coverage.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]