This is an automated email from the ASF dual-hosted git repository. gengliang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push: new b6cf3e4598f [SPARK-40109][SQL] New SQL function: get() b6cf3e4598f is described below commit b6cf3e4598fb6ae9f9ed28c7d5a0d4152453a669 Author: Gengliang Wang <gengli...@apache.org> AuthorDate: Wed Aug 17 15:03:17 2022 -0700 [SPARK-40109][SQL] New SQL function: get() ### What changes were proposed in this pull request? Introduce a new SQL function `get()`: Returns element of array at given (0-based) index. If the index points outside of the array boundaries, then this function returns NULL. Examples: ``` > SELECT _FUNC_(array(1, 2, 3), 2); 3 > SELECT _FUNC_(array(1, 2, 3), 3); NULL > SELECT _FUNC_(array(1, 2, 3), -1); NULL ``` ### Why are the changes needed? Currently, when accessing array element with invalid index under ANSI SQL mode, the error is like: ``` [INVALID_ARRAY_INDEX] The index -1 is out of bounds. The array has 3 elements. Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. ``` The provided solution is complicated. I suggest introducing a new method `get()` which always returns null on an invalid array index. This is from https://docs.snowflake.com/en/sql-reference/functions/get.html. Note: since Spark's map access always returns null, let's not support map type in the get method for now. ### Does this PR introduce _any_ user-facing change? Yes, a new SQL function `get()`: returns element of array at given (0-based) index. If the index points outside of the array boundaries, then this function returns NULL. ### How was this patch tested? New UT Closes #37541 from gengliangwang/addGetMethod. 
Lead-authored-by: Gengliang Wang <gengli...@apache.org> Co-authored-by: Gengliang Wang <ltn...@gmail.com> Signed-off-by: Gengliang Wang <gengli...@apache.org> --- core/src/main/resources/error/error-classes.json | 2 +- .../sql/catalyst/analysis/FunctionRegistry.scala | 1 + .../expressions/collectionOperations.scala | 36 +++++++++++++++++++ .../sql-functions/sql-expression-schema.md | 1 + .../src/test/resources/sql-tests/inputs/array.sql | 6 ++++ .../resources/sql-tests/results/ansi/array.sql.out | 42 +++++++++++++++++++--- .../test/resources/sql-tests/results/array.sql.out | 32 +++++++++++++++++ 7 files changed, 114 insertions(+), 6 deletions(-) diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index c2c5f30564c..3f6c1ca0362 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -224,7 +224,7 @@ }, "INVALID_ARRAY_INDEX" : { "message" : [ - "The index <indexValue> is out of bounds. The array has <arraySize> elements. Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set <ansiConfig> to \"false\" to bypass this error." + "The index <indexValue> is out of bounds. The array has <arraySize> elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set <ansiConfig> to \"false\" to bypass this error." 
] }, "INVALID_ARRAY_INDEX_IN_ELEMENT_AT" : { diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index b655c45bd5f..42f3ca041b8 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -702,6 +702,7 @@ object FunctionRegistry { expression[TransformKeys]("transform_keys"), expression[MapZipWith]("map_zip_with"), expression[ZipWith]("zip_with"), + expression[Get]("get"), CreateStruct.registryEntry, diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala index 3090916582e..40eade75578 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/collectionOperations.scala @@ -2072,6 +2072,42 @@ case class ArrayPosition(left: Expression, right: Expression) copy(left = newLeft, right = newRight) } +/** + * Returns the value of index `right` in Array `left`. If the index points outside of the array + * boundaries, then this function returns NULL. + */ +@ExpressionDescription( + usage = """ + _FUNC_(array, index) - Returns element of array at given (0-based) index. If the index points + outside of the array boundaries, then this function returns NULL. 
+ """, + examples = """ + Examples: + > SELECT _FUNC_(array(1, 2, 3), 0); + 1 + > SELECT _FUNC_(array(1, 2, 3), 3); + NULL + > SELECT _FUNC_(array(1, 2, 3), -1); + NULL + """, + since = "3.4.0", + group = "array_funcs") +case class Get( + left: Expression, + right: Expression, + replacement: Expression) extends RuntimeReplaceable with InheritAnalysisRules { + + def this(left: Expression, right: Expression) = + this(left, right, GetArrayItem(left, right, failOnError = false)) + + override def prettyName: String = "get" + + override def parameters: Seq[Expression] = Seq(left, right) + + override protected def withNewChildInternal(newChild: Expression): Expression = + this.copy(replacement = newChild) +} + /** * Returns the value of index `right` in Array `left` or the value for key `right` in Map `left`. */ diff --git a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md index 7e7b42e55c2..bc7941f659f 100644 --- a/sql/core/src/test/resources/sql-functions/sql-expression-schema.md +++ b/sql/core/src/test/resources/sql-functions/sql-expression-schema.md @@ -129,6 +129,7 @@ | org.apache.spark.sql.catalyst.expressions.FormatString | printf | SELECT printf("Hello World %d %s", 100, "days") | struct<printf(Hello World %d %s, 100, days):string> | | org.apache.spark.sql.catalyst.expressions.FromUTCTimestamp | from_utc_timestamp | SELECT from_utc_timestamp('2016-08-31', 'Asia/Seoul') | struct<from_utc_timestamp(2016-08-31, Asia/Seoul):timestamp> | | org.apache.spark.sql.catalyst.expressions.FromUnixTime | from_unixtime | SELECT from_unixtime(0, 'yyyy-MM-dd HH:mm:ss') | struct<from_unixtime(0, yyyy-MM-dd HH:mm:ss):string> | +| org.apache.spark.sql.catalyst.expressions.Get | get | SELECT get(array(1, 2, 3), 0) | struct<get(array(1, 2, 3), 0):int> | | org.apache.spark.sql.catalyst.expressions.GetJsonObject | get_json_object | SELECT get_json_object('{"a":"b"}', '$.a') | 
struct<get_json_object({"a":"b"}, $.a):string> | | org.apache.spark.sql.catalyst.expressions.GreaterThan | > | SELECT 2 > 1 | struct<(2 > 1):boolean> | | org.apache.spark.sql.catalyst.expressions.GreaterThanOrEqual | >= | SELECT 2 >= 1 | struct<(2 >= 1):boolean> | diff --git a/sql/core/src/test/resources/sql-tests/inputs/array.sql b/sql/core/src/test/resources/sql-tests/inputs/array.sql index dfcf1742feb..cdab8b26ce9 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/array.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/array.sql @@ -113,3 +113,9 @@ select array_size(array(true)); select array_size(array(2, 1)); select array_size(NULL); select array_size(map('a', 1, 'b', 2)); + +-- function get() +select get(array(1, 2, 3), 0); +select get(array(1, 2, 3), 3); +select get(array(1, 2, 3), null); +select get(array(1, 2, 3), -1); diff --git a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out index d2438cbaab6..8342362cf5e 100644 --- a/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/ansi/array.sql.out @@ -198,7 +198,7 @@ select elt(4, '123', '456') struct<> -- !query output org.apache.spark.SparkArrayIndexOutOfBoundsException -[INVALID_ARRAY_INDEX] The index 4 is out of bounds. The array has 2 elements. Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. +[INVALID_ARRAY_INDEX] The index 4 is out of bounds. The array has 2 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. 
== SQL(line 1, position 8) == select elt(4, '123', '456') ^^^^^^^^^^^^^^^^^^^^ @@ -210,7 +210,7 @@ select elt(0, '123', '456') struct<> -- !query output org.apache.spark.SparkArrayIndexOutOfBoundsException -[INVALID_ARRAY_INDEX] The index 0 is out of bounds. The array has 2 elements. Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. +[INVALID_ARRAY_INDEX] The index 0 is out of bounds. The array has 2 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. == SQL(line 1, position 8) == select elt(0, '123', '456') ^^^^^^^^^^^^^^^^^^^^ @@ -222,7 +222,7 @@ select elt(-1, '123', '456') struct<> -- !query output org.apache.spark.SparkArrayIndexOutOfBoundsException -[INVALID_ARRAY_INDEX] The index -1 is out of bounds. The array has 2 elements. Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. +[INVALID_ARRAY_INDEX] The index -1 is out of bounds. The array has 2 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. == SQL(line 1, position 8) == select elt(-1, '123', '456') ^^^^^^^^^^^^^^^^^^^^^ @@ -266,7 +266,7 @@ select array(1, 2, 3)[5] struct<> -- !query output org.apache.spark.SparkArrayIndexOutOfBoundsException -[INVALID_ARRAY_INDEX] The index 5 is out of bounds. The array has 3 elements. 
Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. +[INVALID_ARRAY_INDEX] The index 5 is out of bounds. The array has 3 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. == SQL(line 1, position 8) == select array(1, 2, 3)[5] ^^^^^^^^^^^^^^^^^ @@ -278,7 +278,7 @@ select array(1, 2, 3)[-1] struct<> -- !query output org.apache.spark.SparkArrayIndexOutOfBoundsException -[INVALID_ARRAY_INDEX] The index -1 is out of bounds. The array has 3 elements. Use `try_element_at` and increase the array index by 1(the starting array index is 1 for `try_element_at`) to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. +[INVALID_ARRAY_INDEX] The index -1 is out of bounds. The array has 3 elements. Use the SQL function `get()` to tolerate accessing element at invalid index and return NULL instead. If necessary set "spark.sql.ansi.enabled" to "false" to bypass this error. 
== SQL(line 1, position 8) == select array(1, 2, 3)[-1] ^^^^^^^^^^^^^^^^^^ @@ -323,3 +323,35 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException cannot resolve 'array_size(map('a', 1, 'b', 2))' due to data type mismatch: argument 1 requires array type, however, 'map('a', 1, 'b', 2)' is of map<string,int> type.; line 1 pos 7 + + +-- !query +select get(array(1, 2, 3), 0) +-- !query schema +struct<get(array(1, 2, 3), 0):int> +-- !query output +1 + + +-- !query +select get(array(1, 2, 3), 3) +-- !query schema +struct<get(array(1, 2, 3), 3):int> +-- !query output +NULL + + +-- !query +select get(array(1, 2, 3), null) +-- !query schema +struct<get(array(1, 2, 3), NULL):int> +-- !query output +NULL + + +-- !query +select get(array(1, 2, 3), -1) +-- !query schema +struct<get(array(1, 2, 3), -1):int> +-- !query output +NULL diff --git a/sql/core/src/test/resources/sql-tests/results/array.sql.out b/sql/core/src/test/resources/sql-tests/results/array.sql.out index 25bc001d3d6..b9dbd14b7d2 100644 --- a/sql/core/src/test/resources/sql-tests/results/array.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/array.sql.out @@ -295,3 +295,35 @@ struct<> -- !query output org.apache.spark.sql.AnalysisException cannot resolve 'array_size(map('a', 1, 'b', 2))' due to data type mismatch: argument 1 requires array type, however, 'map('a', 1, 'b', 2)' is of map<string,int> type.; line 1 pos 7 + + +-- !query +select get(array(1, 2, 3), 0) +-- !query schema +struct<get(array(1, 2, 3), 0):int> +-- !query output +1 + + +-- !query +select get(array(1, 2, 3), 3) +-- !query schema +struct<get(array(1, 2, 3), 3):int> +-- !query output +NULL + + +-- !query +select get(array(1, 2, 3), null) +-- !query schema +struct<get(array(1, 2, 3), NULL):int> +-- !query output +NULL + + +-- !query +select get(array(1, 2, 3), -1) +-- !query schema +struct<get(array(1, 2, 3), -1):int> +-- !query output +NULL --------------------------------------------------------------------- To 
unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org