[GitHub] [spark] dtenedor commented on a diff in pull request #39747: [SPARK-42191][SQL] Support udf 'luhn_check'
dtenedor commented on code in PR #39747: URL: https://github.com/apache/spark/pull/39747#discussion_r1089460097 ## sql/core/src/test/resources/sql-tests/inputs/string-functions.sql: ## @@ -231,3 +231,39 @@ CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat SELECT to_binary('abc', fmtField) FROM fmtTable; -- Clean up DROP VIEW IF EXISTS fmtTable; +-- luhn_check +select luhn_check('4111'); +select luhn_check('5504'); +select luhn_check('349'); +select luhn_check('60110004'); +select luhn_check('378282246310005'); +select luhn_check('6011000990139424'); +select luhn_check('1234567890'); +select luhn_check('4'); +select luhn_check('4112'); +select luhn_check('371449635398431'); +select luhn_check('371449635398432'); +select luhn_check('30569309025904'); +select luhn_check('30569309025905'); +select luhn_check('6017'); +select luhn_check('6018'); +select luhn_check('35301110'); +select luhn_check('35301111'); +select luhn_check('5105105105105100'); +select luhn_check('5105105105105106'); +select luhn_check('510B105105105106'); +select luhn_check('E105105105105106'); +select luhn_check('-5105105105105106'); +select luhn_check(5105105105105106); Review Comment: maybe add some cases with whitespace to show the resulting behavior? ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -3039,3 +3039,113 @@ case class SplitPart ( partNum = newChildren.apply(2)) } } + +/** + * Function to check if a given number string is a valid Luhn number. + * Returns true, if the number string is a valid Luhn number, false otherwise + */ +@ExpressionDescription( + usage = """ +_FUNC_(str ) - Checks that a string of digits is valid according to the Luhn algorithm. +This checksum function is widely applied on credit card numbers and government identification +numbers to distinguish valid numbers from mistyped, incorrect numbers. + """, + examples = """ +Examples: + > SELECT _FUNC_('8112189876'); + true + > SELECT _FUNC_('79927398713'); + true + > SELECT _FUNC_('79927398714'); + false + """, + since = "3.5.0", + group = "string_funcs") +case class Luhncheck(child: Expression) extends UnaryExpression with ExpectsInputTypes { + + override def nullable: Boolean = false + + override protected def withNewChildInternal(newChild: Expression): Luhncheck = +copy(child = newChild) + + /** + * Expected input types from child expressions. The i-th position in the returned seq indicates Review Comment: no need to copy the method comment from the base `Expression` class unchanged, you can just leave the method un-commented if it is simple and self-explanatory. ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -3039,3 +3039,113 @@ case class SplitPart ( partNum = newChildren.apply(2)) } } + +/** + * Function to check if a given number string is a valid Luhn number. + * Returns true, if the number string is a valid Luhn number, false otherwise + */ +@ExpressionDescription( + usage = """ +_FUNC_(str ) - Checks that a string of digits is valid according to the Luhn algorithm. +This checksum function is widely applied on credit card numbers and government identification +numbers to distinguish valid numbers from mistyped, incorrect numbers. + """, + examples = """ +Examples: + > SELECT _FUNC_('8112189876'); + true + > SELECT _FUNC_('79927398713'); + true + > SELECT _FUNC_('79927398714'); + false + """, + since = "3.5.0", + group = "string_funcs") +case class Luhncheck(child: Expression) extends UnaryExpression with ExpectsInputTypes { + + override def nullable: Boolean = false + + override protected def withNewChildInternal(newChild: Expression): Luhncheck = +copy(child = newChild) + + /** + * Expected input types from child expressions. The i-th position in the returned seq indicates + * the type requirement for the i-th child. + * + * The possible values at each position are: + * 1. a specific data type, for example, LongType, StringType. + * 2. a non-leaf abstract data type, for example, NumericType, IntegralType, FractionalType. + */ + override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + + /** + * Returns the [[DataType]] of the result of evaluating this expression. It is invalid to query + * the dataType of an unresolved expression (i.e., when `resolved` == false). + */ + override def dataType: DataType = BooleanType + + /** + * Default behavior of evaluation according to the default nullability of UnaryExpression. If + * subclass of UnaryExpression override nullable, probably should also override this. + */ + override def
[GitHub] [spark] dtenedor commented on a diff in pull request #39747: [SPARK-42191][SQL] Support udf 'luhn_check'
dtenedor commented on code in PR #39747: URL: https://github.com/apache/spark/pull/39747#discussion_r1087116548 ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -3039,3 +3039,113 @@ case class SplitPart ( partNum = newChildren.apply(2)) } } + +/** + * Function to check if a given number string is a valid Luhn number. + * Returns true, if the number string is a valid Luhn number, false otherwise + */ +@ExpressionDescription( + usage = """ +_FUNC_(str ) - Checks that a string of digits is valid according to the Luhn algorithm Review Comment: ```suggestion _FUNC_(str ) - Checks that a string of digits is valid according to the Luhn algorithm. ``` ## sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala: ## @@ -3039,3 +3039,113 @@ case class SplitPart ( partNum = newChildren.apply(2)) } } + +/** + * Function to check if a given number string is a valid Luhn number. + * Returns true, if the number string is a valid Luhn number, false otherwise + */ +@ExpressionDescription( + usage = """ +_FUNC_(str ) - Checks that a string of digits is valid according to the Luhn algorithm +This checksum function is widely applied on credit card numbers and government identification +numbers to distinguish valid numbers from mistyped, incorrect numbers. + """, + examples = """ +Examples: + > SELECT _FUNC_('8112189876'); + true + > SELECT _FUNC_('79927398713'); + true + > SELECT _FUNC_('79927398714'); + false + """, + since = "3.4.0", + group = "string_funcs") +case class Luhncheck(child: Expression) extends UnaryExpression with ExpectsInputTypes { + + override def nullable: Boolean = false + + override protected def withNewChildInternal(newChild: Expression): Luhncheck = +copy(child = newChild) + + /** + * Expected input types from child expressions. The i-th position in the returned seq indicates + * the type requirement for the i-th child. + * + * The possible values at each position are: + * 1. a specific data type, for example, LongType, StringType. + * 2. a non-leaf abstract data type, for example, NumericType, IntegralType, FractionalType. + */ + override def inputTypes: Seq[AbstractDataType] = Seq(StringType) + + /** + * Returns the [[DataType]] of the result of evaluating this expression. It is invalid to query + * the dataType of an unresolved expression (i.e., when `resolved` == false). + */ + override def dataType: DataType = BooleanType + + /** + * Default behavior of evaluation according to the default nullability of UnaryExpression. If + * subclass of UnaryExpression override nullable, probably should also override this. + */ + override def eval(input: InternalRow): Any = Luhncheck.isLuhnNumber(child.eval(input)) + + /** + * Returns Java source code that can be compiled to evaluate this expression. The default + * behavior is to call the eval method of the expression. Concrete expression implementations + * should override this to do actual code generation. + * + * @param ctx + * a [[CodegenContext]] + * @param ev + * an [[ExprCode]] with unique terms. + * @return + * an [[ExprCode]] containing the Java source code to generate the given expression + */ + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { +defineCodeGen( + ctx, + ev, + c => { + s"""org.apache.spark.sql.catalyst.expressions.Luhncheck.isLuhnNumber($c)""" + }) + } +} + +object Luhncheck { + + /** + * Function to check if a given number string is a valid Luhn number + * + * @param numberString + * the number string to check + * @return + * true if the number string is a valid Luhn number, false otherwise + */ + def isLuhnNumber(numberString: Any): Boolean = { +if (numberString != null) { + val digits = numberString.asInstanceOf[UTF8String].toString + // check if all characters in the input string are digits + if (!digits.forall(_.isDigit)) { +return false // if not, return false + } + // reverse the string so that we can use a foldLeft function Review Comment: please update the comment to start with a capital letter and end with punctuation (here and everywhere else in the PR) ## sql/core/src/test/resources/sql-tests/inputs/string-functions.sql: ## @@ -231,3 +231,29 @@ CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM VALUES ('invalidFormat SELECT to_binary('abc', fmtField) FROM fmtTable; -- Clean up DROP VIEW IF EXISTS fmtTable; +-- luhn_check +select luhn_check('4111'); +select luhn_check('5504'); +select luhn_check('349'); +select luhn_check('60110004'); +select luhn_check('378282246310005'); +select luhn_check('6011000990139424'); +select