[GitHub] [spark] dtenedor commented on a diff in pull request #39747: [SPARK-42191][SQL] Support udf 'luhn_check'

2023-01-27 Thread via GitHub


dtenedor commented on code in PR #39747:
URL: https://github.com/apache/spark/pull/39747#discussion_r1089460097


##
sql/core/src/test/resources/sql-tests/inputs/string-functions.sql:
##
@@ -231,3 +231,39 @@ CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM 
VALUES ('invalidFormat
 SELECT to_binary('abc', fmtField) FROM fmtTable;
 -- Clean up
 DROP VIEW IF EXISTS fmtTable;
+-- luhn_check
+select luhn_check('4111');
+select luhn_check('5504');
+select luhn_check('349');
+select luhn_check('60110004');
+select luhn_check('378282246310005');
+select luhn_check('6011000990139424');
+select luhn_check('1234567890');
+select luhn_check('4');
+select luhn_check('4112');
+select luhn_check('371449635398431');
+select luhn_check('371449635398432');
+select luhn_check('30569309025904');
+select luhn_check('30569309025905');
+select luhn_check('6017');
+select luhn_check('6018');
+select luhn_check('35301110');
+select luhn_check('35301111');
+select luhn_check('5105105105105100');
+select luhn_check('5105105105105106');
+select luhn_check('510B105105105106');
+select luhn_check('E105105105105106');
+select luhn_check('-5105105105105106');
+select luhn_check(5105105105105106);

Review Comment:
   maybe add some cases with whitespace to show the resulting behavior?



##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -3039,3 +3039,113 @@ case class SplitPart (
   partNum = newChildren.apply(2))
   }
 }
+
+/**
+ * Function to check if a given number string is a valid Luhn number.
+ * Returns true, if the number string is a valid Luhn number, false otherwise
+ */
+@ExpressionDescription(
+  usage = """
+_FUNC_(str ) - Checks that a string of digits is valid according to the 
Luhn algorithm.
+This checksum function is widely applied on credit card numbers and 
government identification
+numbers to distinguish valid numbers from mistyped, incorrect numbers.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('8112189876');
+   true
+  > SELECT _FUNC_('79927398713');
+   true
+  > SELECT _FUNC_('79927398714');
+   false
+  """,
+  since = "3.5.0",
+  group = "string_funcs")
+case class Luhncheck(child: Expression) extends UnaryExpression with 
ExpectsInputTypes {
+
+  override def nullable: Boolean = false
+
+  override protected def withNewChildInternal(newChild: Expression): Luhncheck 
=
+copy(child = newChild)
+
+  /**
+   * Expected input types from child expressions. The i-th position in the 
returned seq indicates

Review Comment:
   no need to copy the method comment from the base `Expression` class 
unchanged, you can just leave the method un-commented if it is simple and 
self-explanatory.



##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -3039,3 +3039,113 @@ case class SplitPart (
   partNum = newChildren.apply(2))
   }
 }
+
+/**
+ * Function to check if a given number string is a valid Luhn number.
+ * Returns true, if the number string is a valid Luhn number, false otherwise
+ */
+@ExpressionDescription(
+  usage = """
+_FUNC_(str ) - Checks that a string of digits is valid according to the 
Luhn algorithm.
+This checksum function is widely applied on credit card numbers and 
government identification
+numbers to distinguish valid numbers from mistyped, incorrect numbers.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('8112189876');
+   true
+  > SELECT _FUNC_('79927398713');
+   true
+  > SELECT _FUNC_('79927398714');
+   false
+  """,
+  since = "3.5.0",
+  group = "string_funcs")
+case class Luhncheck(child: Expression) extends UnaryExpression with 
ExpectsInputTypes {
+
+  override def nullable: Boolean = false
+
+  override protected def withNewChildInternal(newChild: Expression): Luhncheck 
=
+copy(child = newChild)
+
+  /**
+   * Expected input types from child expressions. The i-th position in the 
returned seq indicates
+   * the type requirement for the i-th child.
+   *
+   * The possible values at each position are:
+   *   1. a specific data type, for example, LongType, StringType.
+   *   2. a non-leaf abstract data type, for example, NumericType, 
IntegralType, FractionalType.
+   */
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  /**
+   * Returns the [[DataType]] of the result of evaluating this expression. It 
is invalid to query
+   * the dataType of an unresolved expression (i.e., when `resolved` == false).
+   */
+  override def dataType: DataType = BooleanType
+
+  /**
+   * Default behavior of evaluation according to the default nullability of 
UnaryExpression. If
+   * subclass of UnaryExpression override nullable, probably should also 
override this.
+   */
+  override def 

[GitHub] [spark] dtenedor commented on a diff in pull request #39747: [SPARK-42191][SQL] Support udf 'luhn_check'

2023-01-25 Thread via GitHub


dtenedor commented on code in PR #39747:
URL: https://github.com/apache/spark/pull/39747#discussion_r1087116548


##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -3039,3 +3039,113 @@ case class SplitPart (
   partNum = newChildren.apply(2))
   }
 }
+
+/**
+ * Function to check if a given number string is a valid Luhn number.
+ * Returns true, if the number string is a valid Luhn number, false otherwise
+ */
+@ExpressionDescription(
+  usage = """
+_FUNC_(str ) - Checks that a string of digits is valid according to the 
Luhn algorithm

Review Comment:
   ```suggestion
   _FUNC_(str ) - Checks that a string of digits is valid according to the 
Luhn algorithm.
   ```



##
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##
@@ -3039,3 +3039,113 @@ case class SplitPart (
   partNum = newChildren.apply(2))
   }
 }
+
+/**
+ * Function to check if a given number string is a valid Luhn number.
+ * Returns true, if the number string is a valid Luhn number, false otherwise
+ */
+@ExpressionDescription(
+  usage = """
+_FUNC_(str ) - Checks that a string of digits is valid according to the 
Luhn algorithm
+This checksum function is widely applied on credit card numbers and 
government identification
+numbers to distinguish valid numbers from mistyped, incorrect numbers.
+  """,
+  examples = """
+Examples:
+  > SELECT _FUNC_('8112189876');
+   true
+  > SELECT _FUNC_('79927398713');
+   true
+  > SELECT _FUNC_('79927398714');
+   false
+  """,
+  since = "3.4.0",
+  group = "string_funcs")
+case class Luhncheck(child: Expression) extends UnaryExpression with 
ExpectsInputTypes {
+
+  override def nullable: Boolean = false
+
+  override protected def withNewChildInternal(newChild: Expression): Luhncheck 
=
+copy(child = newChild)
+
+  /**
+   * Expected input types from child expressions. The i-th position in the 
returned seq indicates
+   * the type requirement for the i-th child.
+   *
+   * The possible values at each position are:
+   *   1. a specific data type, for example, LongType, StringType.
+   *   2. a non-leaf abstract data type, for example, NumericType, 
IntegralType, FractionalType.
+   */
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+
+  /**
+   * Returns the [[DataType]] of the result of evaluating this expression. It 
is invalid to query
+   * the dataType of an unresolved expression (i.e., when `resolved` == false).
+   */
+  override def dataType: DataType = BooleanType
+
+  /**
+   * Default behavior of evaluation according to the default nullability of 
UnaryExpression. If
+   * subclass of UnaryExpression override nullable, probably should also 
override this.
+   */
+  override def eval(input: InternalRow): Any = 
Luhncheck.isLuhnNumber(child.eval(input))
+
+  /**
+   * Returns Java source code that can be compiled to evaluate this 
expression. The default
+   * behavior is to call the eval method of the expression. Concrete 
expression implementations
+   * should override this to do actual code generation.
+   *
+   * @param ctx
+   *   a [[CodegenContext]]
+   * @param ev
+   *   an [[ExprCode]] with unique terms.
+   * @return
+   *   an [[ExprCode]] containing the Java source code to generate the given 
expression
+   */
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+defineCodeGen(
+  ctx,
+  ev,
+  c => {
+
s"""org.apache.spark.sql.catalyst.expressions.Luhncheck.isLuhnNumber($c)"""
+  })
+  }
+}
+
+object Luhncheck {
+
+  /**
+   * Function to check if a given number string is a valid Luhn number
+   *
+   * @param numberString
+   *   the number string to check
+   * @return
+   *   true if the number string is a valid Luhn number, false otherwise
+   */
+  def isLuhnNumber(numberString: Any): Boolean = {
+if (numberString != null) {
+  val digits = numberString.asInstanceOf[UTF8String].toString
+  // check if all characters in the input string are digits
+  if (!digits.forall(_.isDigit)) {
+return false // if not, return false
+  }
+  // reverse the string so that we can use a foldLeft function

Review Comment:
   please update the comment to start with a capital letter and end with 
punctuation (here and everywhere else in the PR)



##
sql/core/src/test/resources/sql-tests/inputs/string-functions.sql:
##
@@ -231,3 +231,29 @@ CREATE TEMPORARY VIEW fmtTable(fmtField) AS SELECT * FROM 
VALUES ('invalidFormat
 SELECT to_binary('abc', fmtField) FROM fmtTable;
 -- Clean up
 DROP VIEW IF EXISTS fmtTable;
+-- luhn_check
+select luhn_check('4111');
+select luhn_check('5504');
+select luhn_check('349');
+select luhn_check('60110004');
+select luhn_check('378282246310005');
+select luhn_check('6011000990139424');
+select