This is an automated email from the ASF dual-hosted git repository.

maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
     new e3b1bb117fe9 [SPARK-45262][SQL][TESTS][DOCS] Improve examples for regexp parameters
e3b1bb117fe9 is described below

commit e3b1bb117fe9bf0b17321e6359b7aa90f70a24b5
Author: Max Gekk <max.g...@gmail.com>
AuthorDate: Fri Oct 6 22:34:40 2023 +0300

    [SPARK-45262][SQL][TESTS][DOCS] Improve examples for regexp parameters

    ### What changes were proposed in this pull request?
    In the PR, I propose to add a few more examples for `LIKE`, `ILIKE`, `RLIKE`, `regexp_instr()` and `regexp_extract_all()` that highlight the correctness of the current description and test a couple more corner cases.

    ### Why are the changes needed?
    The description of `LIKE` says:
    ```
    ... in order to match "\abc", the pattern should be "\\abc"
    ```
    but in the Spark SQL shell:
    ```sql
    spark-sql (default)> SELECT c FROM t;
    \abc
    spark-sql (default)> SELECT c LIKE "\\abc" FROM t;
    [INVALID_FORMAT.ESC_IN_THE_MIDDLE] The format is invalid: '\\abc'. The escape character is not allowed to precede 'a'.
    spark-sql (default)> SELECT c LIKE "\\\\abc" FROM t;
    true
    ```
    So, the description might confuse users, since the pattern must contain four backslashes when it is written as a regular SQL string. The new example shows that the pattern "\\abc" is correct when it is written as a raw string:
    ```sql
    spark-sql (default)> SELECT c LIKE R"\\abc" FROM t;
    true
    ```

    ### Does this PR introduce _any_ user-facing change?
    No.

    ### How was this patch tested?
    By running the new and modified tests:
    ```
    $ build/sbt "test:testOnly *.StringFunctionsSuite"
    $ build/sbt "sql/test:testOnly org.apache.spark.sql.expressions.ExpressionInfoSuite"
    ```

    ### Was this patch authored or co-authored using generative AI tooling?
    No.

    Closes #43037 from MaxGekk/fix-like-doc.

    Authored-by: Max Gekk <max.g...@gmail.com>
    Signed-off-by: Max Gekk <max.g...@gmail.com>
---
 .../sql/catalyst/expressions/regexpExpressions.scala | 18 ++++++++++++++++--
 .../resources/sql-functions/sql-expression-schema.md |  2 +-
 .../org/apache/spark/sql/StringFunctionsSuite.scala  |  5 +++++
 3 files changed, 22 insertions(+), 3 deletions(-)
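A minimal sketch, not part of the commit, that reproduces the escaping behavior described above from Scala (the local `SparkSession` setup below is purely illustrative):

```scala
import org.apache.spark.sql.SparkSession

// Illustrative setup only: a throwaway local session for experimentation.
val spark = SparkSession.builder().master("local[*]").appName("like-escaping").getOrCreate()

// With the default spark.sql.parser.escapedStringLiterals=false, the value
// '\\abc' parses to the string \abc. A pattern written as a regular SQL
// string then needs four backslashes: '\\\\abc' parses to \\abc, which LIKE
// in turn unescapes to \abc.
spark.sql("""SELECT '\\abc' LIKE '\\\\abc'""").show()  // expected: true

// A raw string literal skips parser-level escaping, so two backslashes
// suffice: r'\\abc' is the pattern \\abc, which LIKE unescapes to \abc.
spark.sql("""SELECT '\\abc' LIKE r'\\abc'""").show()   // expected: true
```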
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 87ea8b5a102a..b33de303b5d5 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -108,13 +108,15 @@ abstract class StringRegexExpression extends BinaryExpression
     Examples:
       > SELECT _FUNC_('Spark', '_park');
      true
+      > SELECT '\\abc' AS S, S _FUNC_ r'\\abc', S _FUNC_ '\\\\abc';
+      \abc	true	true
       > SET spark.sql.parser.escapedStringLiterals=true;
      spark.sql.parser.escapedStringLiterals	true
       > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\Users%';
      true
       > SET spark.sql.parser.escapedStringLiterals=false;
      spark.sql.parser.escapedStringLiterals	false
-      > SELECT '%SystemDrive%\\Users\\John' _FUNC_ '\%SystemDrive\%\\\\Users%';
+      > SELECT '%SystemDrive%\\Users\\John' _FUNC_ r'%SystemDrive%\\Users%';
      true
       > SELECT '%SystemDrive%/Users/John' _FUNC_ '/%SystemDrive/%//Users%' ESCAPE '/';
      true
@@ -226,13 +228,15 @@ case class Like(left: Expression, right: Expression, escapeChar: Char)
     Examples:
       > SELECT _FUNC_('Spark', '_Park');
      true
+      > SELECT '\\abc' AS S, S _FUNC_ r'\\abc', S _FUNC_ '\\\\abc';
+      \abc	true	true
       > SET spark.sql.parser.escapedStringLiterals=true;
      spark.sql.parser.escapedStringLiterals	true
       > SELECT '%SystemDrive%\Users\John' _FUNC_ '\%SystemDrive\%\\users%';
      true
       > SET spark.sql.parser.escapedStringLiterals=false;
      spark.sql.parser.escapedStringLiterals	false
-      > SELECT '%SystemDrive%\\USERS\\John' _FUNC_ '\%SystemDrive\%\\\\Users%';
+      > SELECT '%SystemDrive%\\USERS\\John' _FUNC_ r'%SystemDrive%\\Users%';
      true
       > SELECT '%SystemDrive%/Users/John' _FUNC_ '/%SYSTEMDrive/%//Users%' ESCAPE '/';
      true
@@ -446,6 +450,8 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
      spark.sql.parser.escapedStringLiterals	false
       > SELECT _FUNC_('%SystemDrive%\\Users\\John', '%SystemDrive%\\\\Users.*');
      true
+      > SELECT _FUNC_('%SystemDrive%\\Users\\John', r'%SystemDrive%\\Users.*');
+      true
   """,
   note = """
     Use LIKE to match with simple string pattern.
@@ -596,6 +602,8 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)
     Examples:
       > SELECT _FUNC_('100-200', '(\\d+)', 'num');
       num-num
+      > SELECT _FUNC_('100-200', r'(\d+)', 'num');
+      num-num
   """,
   since = "1.5.0",
   group = "string_funcs")
@@ -813,6 +821,8 @@ abstract class RegExpExtractBase
     Examples:
       > SELECT _FUNC_('100-200', '(\\d+)-(\\d+)', 1);
       100
+      > SELECT _FUNC_('100-200', r'(\d+)-(\d+)', 1);
+      100
   """,
   since = "1.5.0",
   group = "string_funcs")
@@ -909,6 +919,8 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
     Examples:
       > SELECT _FUNC_('100-200, 300-400', '(\\d+)-(\\d+)', 1);
       ["100","300"]
+      > SELECT _FUNC_('100-200, 300-400', r'(\d+)-(\d+)', 1);
+      ["100","300"]
   """,
   since = "3.1.0",
   group = "string_funcs")
@@ -1075,6 +1087,8 @@ case class RegExpSubStr(left: Expression, right: Expression)
   """,
   examples = """
     Examples:
+      > SELECT _FUNC_(r"\abc", r"^\\abc$");
+      1
       > SELECT _FUNC_('u...@spark.apache.org', '@[^.]*');
       5
   """,
diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
index 89e840d12428..1573b5c56086 100644
--- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
+++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md
@@ -262,7 +262,7 @@
 | org.apache.spark.sql.catalyst.expressions.RegExpCount | regexp_count | SELECT regexp_count('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_count(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):int> |
 | org.apache.spark.sql.catalyst.expressions.RegExpExtract | regexp_extract | SELECT regexp_extract('100-200', '(\\d+)-(\\d+)', 1) | struct<regexp_extract(100-200, (\d+)-(\d+), 1):string> |
 | org.apache.spark.sql.catalyst.expressions.RegExpExtractAll | regexp_extract_all | SELECT regexp_extract_all('100-200, 300-400', '(\\d+)-(\\d+)', 1) | struct<regexp_extract_all(100-200, 300-400, (\d+)-(\d+), 1):array<string>> |
-| org.apache.spark.sql.catalyst.expressions.RegExpInStr | regexp_instr | SELECT regexp_instr('u...@spark.apache.org', '@[^.]*') | struct<regexp_instr(u...@spark.apache.org, @[^.]*, 0):int> |
+| org.apache.spark.sql.catalyst.expressions.RegExpInStr | regexp_instr | SELECT regexp_instr(r"\abc", r"^\\abc$") | struct<regexp_instr(\abc, ^\\abc$, 0):int> |
 | org.apache.spark.sql.catalyst.expressions.RegExpReplace | regexp_replace | SELECT regexp_replace('100-200', '(\\d+)', 'num') | struct<regexp_replace(100-200, (\d+), num, 1):string> |
 | org.apache.spark.sql.catalyst.expressions.RegExpSubStr | regexp_substr | SELECT regexp_substr('Steven Jones and Stephen Smith are the best players', 'Ste(v|ph)en') | struct<regexp_substr(Steven Jones and Stephen Smith are the best players, Ste(v|ph)en):string> |
 | org.apache.spark.sql.catalyst.expressions.Remainder | % | SELECT 2 % 1.8 | struct<(2 % 1.8):decimal(2,1)> |
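The docstring examples above lean on raw string literals to keep regexp patterns readable. A small sketch, again illustrative rather than part of the commit, of how those raw-string patterns evaluate (reusing the `spark` session from the earlier snippet):

```scala
// r'(\d+)-(\d+)' is the regexp (\d+)-(\d+) verbatim: no backslash doubling.
spark.sql("""SELECT regexp_extract('100-200', r'(\d+)-(\d+)', 1)""").show()
// expected: 100
spark.sql("""SELECT regexp_extract_all('100-200, 300-400', r'(\d+)-(\d+)', 1)""").show()
// expected: ["100","300"]

// The subject r'\abc' is the literal string \abc; the regexp r'^\\abc$'
// matches a leading backslash followed by abc, so the 1-based position
// reported by regexp_instr is 1.
spark.sql("""SELECT regexp_instr(r'\abc', r'^\\abc$')""").show()
// expected: 1
```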
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
index 8e9be5dcdced..422498ac9dc6 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/StringFunctionsSuite.scala
@@ -998,6 +998,11 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
     checkAnswer(df.selectExpr("a ilike b escape '/'"), Seq(Row(true)))
     checkAnswer(df.select(ilike(col("a"), col("b"), lit('/'))), Seq(Row(true)))
 
+    val df2 = Seq(("""abc\""", """%\\""")).toDF("i", "p")
+    checkAnswer(df2.select(like(col("i"), col("p"))), Seq(Row(true)))
+    val df3 = Seq(("""\abc""", """\\abc""")).toDF("i", "p")
+    checkAnswer(df3.select(like(col("i"), col("p"))), Seq(Row(true)))
+
     checkError(
       exception = intercept[AnalysisException] {
         df1.select(like(col("a"), col("b"), lit(618))).collect()

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org