uros-db commented on code in PR #46845:
URL: https://github.com/apache/spark/pull/46845#discussion_r1631166863


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##########
@@ -686,6 +686,205 @@ case class EndsWith(left: Expression, right: Expression) 
extends StringPredicate
     newLeft: Expression, newRight: Expression): EndsWith = copy(left = 
newLeft, right = newRight)
 }
 
+/**
+ * A function that checks if a UTF8 string is valid.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns true if `str` is a valid UTF-8 sequence, 
otherwise returns false.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       true
+      > SELECT _FUNC_(x'80');
+       false
+      > SELECT _FUNC_(x'61C262');
+       false
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+case class IsValidUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = BooleanType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].isValidUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    defineCodeGen(ctx, ev, c => s"${ev.value} = $c.isValidUTF8();")
+  }
+
+  override def prettyName: String = "is_valid_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
IsValidUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that converts an invalid UTF8 byte sequence to a valid UTF8 
byte sequence,
+ * according to the UNICODE standard rules (Section 3.9-D86). Valid sequences 
stay unchanged.
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise returns a new string whose invalid UTF8 bytes sequences are 
replaced using the " +
+    "UNICODE replacement character U+FFFD.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+      > SELECT _FUNC_(x'80');
+       �
+      > SELECT _FUNC_(x'61C262');
+       a�b
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class MakeValidUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].makeValidUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    defineCodeGen(ctx, ev, c => s"${ev.value} = $c.makeValidUTF8();")
+  }
+
+  override def prettyName: String = "make_valid_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
MakeValidUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that validates a UTF8 string.
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise throws an exception.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class ValidateUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].validateUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, c =>
+      s"""
+        if (!$c.isValidUTF8()) {
+          throw new IllegalArgumentException("Invalid UTF-8 string");
+        } else {
+          ${ev.value} = $c;
+        }
+      """
+    )
+  }
+
+  override def prettyName: String = "validate_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
ValidateUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that tries to validate a UTF8 string.

Review Comment:
   modified a bit, although some more info is already found in expression 
description / usage



##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##########
@@ -686,6 +686,205 @@ case class EndsWith(left: Expression, right: Expression) 
extends StringPredicate
     newLeft: Expression, newRight: Expression): EndsWith = copy(left = 
newLeft, right = newRight)
 }
 
+/**
+ * A function that checks if a UTF8 string is valid.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns true if `str` is a valid UTF-8 sequence, 
otherwise returns false.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       true
+      > SELECT _FUNC_(x'80');
+       false
+      > SELECT _FUNC_(x'61C262');
+       false
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+case class IsValidUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = BooleanType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].isValidUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    defineCodeGen(ctx, ev, c => s"${ev.value} = $c.isValidUTF8();")
+  }
+
+  override def prettyName: String = "is_valid_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
IsValidUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that converts an invalid UTF8 byte sequence to a valid UTF8 
byte sequence,
+ * according to the UNICODE standard rules (Section 3.9-D86). Valid sequences 
stay unchanged.
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise returns a new string whose invalid UTF8 bytes sequences are 
replaced using the " +
+    "UNICODE replacement character U+FFFD.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+      > SELECT _FUNC_(x'80');
+       �
+      > SELECT _FUNC_(x'61C262');
+       a�b
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class MakeValidUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].makeValidUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    defineCodeGen(ctx, ev, c => s"${ev.value} = $c.makeValidUTF8();")
+  }
+
+  override def prettyName: String = "make_valid_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
MakeValidUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that validates a UTF8 string.
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise throws an exception.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class ValidateUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].validateUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, c =>
+      s"""
+        if (!$c.isValidUTF8()) {
+          throw new IllegalArgumentException("Invalid UTF-8 string");
+        } else {
+          ${ev.value} = $c;
+        }
+      """
+    )
+  }
+
+  override def prettyName: String = "validate_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
ValidateUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that tries to validate a UTF8 string.
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise returns null.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+      > SELECT _FUNC_(x'80');
+       NULL
+      > SELECT _FUNC_(x'61C262');
+       NULL
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class TryValidateUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].tryValidateUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, c =>
+      s"""
+        if (!$c.isValidUTF8()) {
+          ${ev.isNull} = true;
+        } else {
+          ${ev.value} = $c;
+        }
+      """
+    )
+  }

Review Comment:
   agreed



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to