Repository: spark Updated Branches: refs/heads/master 87ab0cec6 -> 7a3e5dc28
[SPARK-20749][SQL] Built-in SQL Function Support - all variants of LEN[GTH] ## What changes were proposed in this pull request? This PR adds built-in SQL function `BIT_LENGTH()`, `CHAR_LENGTH()`, and `OCTET_LENGTH()` functions. `BIT_LENGTH()` returns the bit length of the given string or binary expression. `CHAR_LENGTH()` returns the length of the given string or binary expression. (i.e. equal to `LENGTH()`) `OCTET_LENGTH()` returns the byte length of the given string or binary expression. ## How was this patch tested? Added new test suites for these three functions Author: Kazuaki Ishizaki <ishiz...@jp.ibm.com> Closes #18046 from kiszk/SPARK-20749. Project: http://git-wip-us.apache.org/repos/asf/spark/repo Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/7a3e5dc2 Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/7a3e5dc2 Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/7a3e5dc2 Branch: refs/heads/master Commit: 7a3e5dc28b67ac1630c5a578a27a5a5acf80aa51 Parents: 87ab0ce Author: Kazuaki Ishizaki <ishiz...@jp.ibm.com> Authored: Thu Jun 15 23:06:58 2017 -0700 Committer: Xiao Li <gatorsm...@gmail.com> Committed: Thu Jun 15 23:06:58 2017 -0700 ---------------------------------------------------------------------- .../catalyst/analysis/FunctionRegistry.scala | 3 + .../expressions/stringExpressions.scala | 61 +++++++++++++++++++- .../expressions/StringExpressionsSuite.scala | 20 +++++++ .../resources/sql-tests/inputs/operators.sql | 5 ++ .../sql-tests/results/operators.sql.out | 26 ++++++++- 5 files changed, 112 insertions(+), 3 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index 8773281..e4e9918 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -305,6 +305,8 @@ object FunctionRegistry { expression[Chr]("char"), expression[Chr]("chr"), expression[Base64]("base64"), + expression[BitLength]("bit_length"), + expression[Length]("char_length"), expression[Concat]("concat"), expression[ConcatWs]("concat_ws"), expression[Decode]("decode"), @@ -321,6 +323,7 @@ object FunctionRegistry { expression[Levenshtein]("levenshtein"), expression[Like]("like"), expression[Lower]("lower"), + expression[OctetLength]("octet_length"), expression[StringLocate]("locate"), expression[StringLPad]("lpad"), expression[StringTrimLeft]("ltrim"), http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala old mode 100644 new mode 100755 index 717ada2..908fdb8 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala @@ -1199,15 +1199,18 @@ case class Substring(str: Expression, pos: Expression, len: Expression) } /** - * A function that return the length of the given string or binary expression. + * A function that returns the char length of the given string expression or + * number of bytes of the given binary expression. */ +// scalastyle:off line.size.limit @ExpressionDescription( - usage = "_FUNC_(expr) - Returns the length of `expr` or number of bytes in binary data.", + usage = "_FUNC_(expr) - Returns the character length of `expr` or number of bytes in binary data.", extended = """ Examples: > SELECT _FUNC_('Spark SQL'); 9 """) +// scalastyle:on line.size.limit case class Length(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { override def dataType: DataType = IntegerType override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) @@ -1226,6 +1229,60 @@ case class Length(child: Expression) extends UnaryExpression with ImplicitCastIn } /** + * A function that returns the bit length of the given string or binary expression. + */ +@ExpressionDescription( + usage = "_FUNC_(expr) - Returns the bit length of `expr` or number of bits in binary data.", + extended = """ + Examples: + > SELECT _FUNC_('Spark SQL'); + 72 + """) +case class BitLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { + override def dataType: DataType = IntegerType + override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + + protected override def nullSafeEval(value: Any): Any = child.dataType match { + case StringType => value.asInstanceOf[UTF8String].numBytes * 8 + case BinaryType => value.asInstanceOf[Array[Byte]].length * 8 + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + child.dataType match { + case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes() * 8") + case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length * 8") + } + } +} + +/** + * A function that returns the byte length of the given string or binary expression. + */ +@ExpressionDescription( + usage = "_FUNC_(expr) - Returns the byte length of `expr` or number of bytes in binary data.", + extended = """ + Examples: + > SELECT _FUNC_('Spark SQL'); + 9 + """) +case class OctetLength(child: Expression) extends UnaryExpression with ImplicitCastInputTypes { + override def dataType: DataType = IntegerType + override def inputTypes: Seq[AbstractDataType] = Seq(TypeCollection(StringType, BinaryType)) + + protected override def nullSafeEval(value: Any): Any = child.dataType match { + case StringType => value.asInstanceOf[UTF8String].numBytes + case BinaryType => value.asInstanceOf[Array[Byte]].length + } + + override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + child.dataType match { + case StringType => defineCodeGen(ctx, ev, c => s"($c).numBytes()") + case BinaryType => defineCodeGen(ctx, ev, c => s"($c).length") + } + } +} + +/** * A function that return the Levenshtein distance between the two given strings. */ @ExpressionDescription( http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala ---------------------------------------------------------------------- diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala index 4bdb43b..4f08031 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/StringExpressionsSuite.scala @@ -558,20 +558,40 @@ class StringExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { // scalastyle:off // non ascii characters are not allowed in the source code, so we disable the scalastyle. checkEvaluation(Length(Literal("aè±è±c")), 4, create_row(string)) + checkEvaluation(OctetLength(Literal("aè±è±c")), 8, create_row(string)) + checkEvaluation(BitLength(Literal("aè±è±c")), 8 * 8, create_row(string)) // scalastyle:on checkEvaluation(Length(Literal(bytes)), 5, create_row(Array.empty[Byte])) + checkEvaluation(OctetLength(Literal(bytes)), 5, create_row(Array.empty[Byte])) + checkEvaluation(BitLength(Literal(bytes)), 5 * 8, create_row(Array.empty[Byte])) checkEvaluation(Length(a), 5, create_row(string)) + checkEvaluation(OctetLength(a), 5, create_row(string)) + checkEvaluation(BitLength(a), 5 * 8, create_row(string)) checkEvaluation(Length(b), 5, create_row(bytes)) + checkEvaluation(OctetLength(b), 5, create_row(bytes)) + checkEvaluation(BitLength(b), 5 * 8, create_row(bytes)) checkEvaluation(Length(a), 0, create_row("")) + checkEvaluation(OctetLength(a), 0, create_row("")) + checkEvaluation(BitLength(a), 0, create_row("")) checkEvaluation(Length(b), 0, create_row(Array.empty[Byte])) + checkEvaluation(OctetLength(b), 0, create_row(Array.empty[Byte])) + checkEvaluation(BitLength(b), 0, create_row(Array.empty[Byte])) checkEvaluation(Length(a), null, create_row(null)) + checkEvaluation(OctetLength(a), null, create_row(null)) + checkEvaluation(BitLength(a), null, create_row(null)) checkEvaluation(Length(b), null, create_row(null)) + checkEvaluation(OctetLength(b), null, create_row(null)) + checkEvaluation(BitLength(b), null, create_row(null)) checkEvaluation(Length(Literal.create(null, StringType)), null, create_row(string)) + checkEvaluation(OctetLength(Literal.create(null, StringType)), null, create_row(string)) + checkEvaluation(BitLength(Literal.create(null, StringType)), null, create_row(string)) checkEvaluation(Length(Literal.create(null, BinaryType)), null, create_row(bytes)) + checkEvaluation(OctetLength(Literal.create(null, BinaryType)), null, create_row(bytes)) + checkEvaluation(BitLength(Literal.create(null, BinaryType)), null, create_row(bytes)) } test("format_number / FormatNumber") { http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/core/src/test/resources/sql-tests/inputs/operators.sql ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/inputs/operators.sql b/sql/core/src/test/resources/sql-tests/inputs/operators.sql index 3934620..a8de23e 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/operators.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/operators.sql @@ -80,3 +80,8 @@ select 1 > 0.00001; -- mod select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, null); + +-- length +select BIT_LENGTH('abc'); +select CHAR_LENGTH('abc'); +select OCTET_LENGTH('abc'); http://git-wip-us.apache.org/repos/asf/spark/blob/7a3e5dc2/sql/core/src/test/resources/sql-tests/results/operators.sql.out ---------------------------------------------------------------------- diff --git a/sql/core/src/test/resources/sql-tests/results/operators.sql.out b/sql/core/src/test/resources/sql-tests/results/operators.sql.out index 51ccf76..85ee10b 100644 --- a/sql/core/src/test/resources/sql-tests/results/operators.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/operators.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 51 +-- Number of queries: 54 -- !query 0 @@ -420,3 +420,27 @@ select mod(7, 2), mod(7, 0), mod(0, 2), mod(7, null), mod(null, 2), mod(null, nu struct<(7 % 2):int,(7 % 0):int,(0 % 2):int,(7 % CAST(NULL AS INT)):int,(CAST(NULL AS INT) % 2):int,(CAST(NULL AS DOUBLE) % CAST(NULL AS DOUBLE)):double> -- !query 50 output 1 NULL 0 NULL NULL NULL + + +-- !query 51 +select BIT_LENGTH('abc') +-- !query 51 schema +struct<bitlength(abc):int> +-- !query 51 output +24 + + +-- !query 52 +select CHAR_LENGTH('abc') +-- !query 52 schema +struct<length(abc):int> +-- !query 52 output +3 + + +-- !query 53 +select OCTET_LENGTH('abc') +-- !query 53 schema +struct<octetlength(abc):int> +-- !query 53 output +3 --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org For additional commands, e-mail: commits-h...@spark.apache.org