uros-db commented on code in PR #46845:
URL: https://github.com/apache/spark/pull/46845#discussion_r1631166079


##########
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/stringExpressions.scala:
##########
@@ -686,6 +686,205 @@ case class EndsWith(left: Expression, right: Expression) 
extends StringPredicate
     newLeft: Expression, newRight: Expression): EndsWith = copy(left = 
newLeft, right = newRight)
 }
 
+/**
+ * A function that checks if a UTF8 string is valid.
+ *
+ * The expression declares a single `StringTypeAnyCollation` input (mixed in with
+ * `ImplicitCastInputTypes`) and produces a `BooleanType` result. Both the interpreted
+ * path (`nullSafeEval`) and the generated code delegate to `UTF8String.isValidUTF8`.
+ * The expression always reports itself as nullable.
+ *
+ * @param srcExpr the string expression whose bytes are checked
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns true if `str` is a valid UTF-8 sequence, 
otherwise returns false.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       true
+      > SELECT _FUNC_(x'80');
+       false
+      > SELECT _FUNC_(x'61C262');
+       false
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+case class IsValidUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = BooleanType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].isValidUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    defineCodeGen(ctx, ev, c => s"${ev.value} = $c.isValidUTF8();")
+  }
+
+  override def prettyName: String = "is_valid_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
IsValidUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that converts invalid UTF8 byte sequences to valid UTF8 byte sequences,
+ * according to the UNICODE standard rules (Section 3.9-D86). Valid sequences stay unchanged.
+ *
+ * Per the `usage` text, invalid byte sequences are replaced using the UNICODE replacement
+ * character U+FFFD. The expression declares a single `StringTypeAnyCollation` input (mixed
+ * in with `ImplicitCastInputTypes`) and its result type is the child's data type. Both the
+ * interpreted path (`nullSafeEval`) and the generated code delegate to
+ * `UTF8String.makeValidUTF8`. The expression always reports itself as nullable.
+ *
+ * @param srcExpr the string expression whose bytes are repaired
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise returns a new string whose invalid UTF8 bytes sequences are 
replaced using the " +
+    "UNICODE replacement character U+FFFD.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+      > SELECT _FUNC_(x'80');
+       �
+      > SELECT _FUNC_(x'61C262');
+       a�b
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class MakeValidUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].makeValidUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    defineCodeGen(ctx, ev, c => s"${ev.value} = $c.makeValidUTF8();")
+  }
+
+  override def prettyName: String = "make_valid_utf8"
+
+  override protected def withNewChildInternal(newChild: Expression): 
MakeValidUTF8 = {
+    copy(srcExpr = newChild)
+  }
+
+}
+
+/**
+ * A function that validates a UTF8 string.
+ */
+// scalastyle:off
+@ExpressionDescription(
+  usage = "_FUNC_(str) - Returns the original string if `str` is a valid UTF-8 
sequence, " +
+    "otherwise throws an exception.",
+  arguments = """
+    Arguments:
+      * str - a string expression
+  """,
+  examples = """
+    Examples:
+      > SELECT _FUNC_(x'61');
+       a
+  """,
+  since = "4.0.0",
+  group = "string_funcs")
+// scalastyle:on
+case class ValidateUTF8(srcExpr: Expression) extends UnaryExpression with 
ImplicitCastInputTypes
+  with NullIntolerant {
+
+  override def child: Expression = srcExpr
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringTypeAnyCollation)
+  override def dataType: DataType = child.dataType
+  override def nullable: Boolean = true
+
+  override def nullSafeEval(srcEval: Any): Any = {
+    srcEval.asInstanceOf[UTF8String].validateUTF8
+  }
+
+  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
+    nullSafeCodeGen(ctx, ev, c =>
+      s"""
+        if (!$c.isValidUTF8()) {
+          throw new IllegalArgumentException("Invalid UTF-8 string");
+        } else {
+          ${ev.value} = $c;
+        }
+      """
+    )
+  }

Review Comment:
   yup, removing UTF8String.validate and UTF8String.tryValidate



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org

Reply via email to