uros-db commented on code in PR #46040:
URL: https://github.com/apache/spark/pull/46040#discussion_r1568746724


##########
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##########
@@ -212,6 +212,49 @@ class CollationStringExpressionsSuite
     })
   }
 
+  test("Support Left/Right/Substr with collation") {
+    case class SubstringTestCase(query: String, collation: String, result: Row)
+    val checks = Seq("utf8_binary_lcase", "utf8_binary", "unicode", 
"unicode_ci").flatMap(
+      c => Seq(
+        SubstringTestCase("select substr('example' collate " + c + ", 1, 
100)", c, Row("example")),
+        SubstringTestCase("select substr('example' collate " + c + ", 2, 2)", 
c, Row("xa")),
+        SubstringTestCase("select substr('example' collate " + c + ", 0, 0)", 
c, Row("")),
+        SubstringTestCase("select substr('example' collate " + c + ", -3, 2)", 
c, Row("pl")),
+        SubstringTestCase("select substr(' a世a ' collate " + c + ", 2, 3)", c, 
Row("a世a")), // scalastyle:ignore
+        SubstringTestCase("select left(' a世a ' collate " + c + ", 3)", c, 
Row(" a世")), // scalastyle:ignore
+        SubstringTestCase("select right(' a世a ' collate " + c + ", 3)", c, 
Row("世a ")), // scalastyle:ignore
+        SubstringTestCase("select left('AaAaAaAa000000' collate " + c + ", 
3)", c, Row("AaA")),
+        SubstringTestCase("select right('AaAaAaAa000000' collate " + c + ", 
3)", c, Row("000")),
+        SubstringTestCase("select substr('' collate " + c + ", 1, 1)", c, 
Row("")),
+        SubstringTestCase("select left('' collate " + c + ", 1)", c, Row("")),
+        SubstringTestCase("select right('' collate " + c + ", 1)", c, Row("")),
+        // improper values
+        SubstringTestCase("select left(null collate " + c + ", 1)", c, 
Row(null)),
+        SubstringTestCase("select right(null collate " + c + ", 1)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", 1)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", 1, 1)", c, 
Row(null)),
+        SubstringTestCase("select left(null collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select right(null collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", null, null)", 
c, Row(null)),
+        SubstringTestCase("select left('AaAaAaAa000000' collate " + c + ", 
null)", c, Row(null)),
+        SubstringTestCase("select right('AaAaAaAa000000' collate " + c + ", 
null)", c, Row(null)),
+        SubstringTestCase("select substr('AaAaAaAa000000' collate " + c + ", 
null)", c, Row(null)),
+        SubstringTestCase("select substr('AaAaAaAa0' collate " + c + ", null, 
null)", c, Row(null)),
+        SubstringTestCase("select right('' collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr('' collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr('' collate " + c + ", null, null)", 
c, Row(null)),
+        SubstringTestCase("select left('' collate " + c + ", null)", c, 
Row(null))

Review Comment:
   28 cases * 4 collations = 112 tests
   
   I'd say we don't need that many SQL tests. There's no need to do 
`Seq("utf8_binary_lcase", "utf8_binary", "unicode", "unicode_ci").flatMap`; 
4 tests (with valid values) per function (substring/left/right) should be 
enough.
   
   A couple of additional tests for improper values are fine as well, but we 
don't need to test every possible combination of collation and function.



##########
sql/core/src/test/scala/org/apache/spark/sql/CollationStringExpressionsSuite.scala:
##########
@@ -212,6 +212,49 @@ class CollationStringExpressionsSuite
     })
   }
 
+  test("Support Left/Right/Substr with collation") {
+    case class SubstringTestCase(query: String, collation: String, result: Row)
+    val checks = Seq("utf8_binary_lcase", "utf8_binary", "unicode", 
"unicode_ci").flatMap(
+      c => Seq(
+        SubstringTestCase("select substr('example' collate " + c + ", 1, 
100)", c, Row("example")),
+        SubstringTestCase("select substr('example' collate " + c + ", 2, 2)", 
c, Row("xa")),
+        SubstringTestCase("select substr('example' collate " + c + ", 0, 0)", 
c, Row("")),
+        SubstringTestCase("select substr('example' collate " + c + ", -3, 2)", 
c, Row("pl")),
+        SubstringTestCase("select substr(' a世a ' collate " + c + ", 2, 3)", c, 
Row("a世a")), // scalastyle:ignore
+        SubstringTestCase("select left(' a世a ' collate " + c + ", 3)", c, 
Row(" a世")), // scalastyle:ignore
+        SubstringTestCase("select right(' a世a ' collate " + c + ", 3)", c, 
Row("世a ")), // scalastyle:ignore
+        SubstringTestCase("select left('AaAaAaAa000000' collate " + c + ", 
3)", c, Row("AaA")),
+        SubstringTestCase("select right('AaAaAaAa000000' collate " + c + ", 
3)", c, Row("000")),
+        SubstringTestCase("select substr('' collate " + c + ", 1, 1)", c, 
Row("")),
+        SubstringTestCase("select left('' collate " + c + ", 1)", c, Row("")),
+        SubstringTestCase("select right('' collate " + c + ", 1)", c, Row("")),
+        // improper values
+        SubstringTestCase("select left(null collate " + c + ", 1)", c, 
Row(null)),
+        SubstringTestCase("select right(null collate " + c + ", 1)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", 1)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", 1, 1)", c, 
Row(null)),
+        SubstringTestCase("select left(null collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select right(null collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr(null collate " + c + ", null, null)", 
c, Row(null)),
+        SubstringTestCase("select left('AaAaAaAa000000' collate " + c + ", 
null)", c, Row(null)),
+        SubstringTestCase("select right('AaAaAaAa000000' collate " + c + ", 
null)", c, Row(null)),
+        SubstringTestCase("select substr('AaAaAaAa000000' collate " + c + ", 
null)", c, Row(null)),
+        SubstringTestCase("select substr('AaAaAaAa0' collate " + c + ", null, 
null)", c, Row(null)),
+        SubstringTestCase("select right('' collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr('' collate " + c + ", null)", c, 
Row(null)),
+        SubstringTestCase("select substr('' collate " + c + ", null, null)", 
c, Row(null)),
+        SubstringTestCase("select left('' collate " + c + ", null)", c, 
Row(null))

Review Comment:
   Since we don't have unit tests for these functions, let's use a smaller 
number of tests here and have them cover more things. For example, I don't 
see any case/accent variation here, nor a wider variety of variable-length 
characters, etc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to