Github user xuanyuanking commented on a diff in the pull request:
https://github.com/apache/spark/pull/21985#discussion_r207712323
--- Diff:
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
---
@@ -446,3 +448,88 @@ case class RegExpExtract(subject: Expression, regexp:
Expression, idx: Expressio
})
}
}
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal
mutable status.
+ */
+@ExpressionDescription(
+ usage = "_FUNC_(str, regexp[, idx]) - Extracts all groups that matches
`regexp`.",
+ examples = """
+Examples:
+ > SELECT _FUNC_('100-200,300-400', '(\\d+)-(\\d+)', 1);
+ [100, 300]
+ """)
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx:
Expression)
+ extends TernaryExpression with ImplicitCastInputTypes {
+ def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+ // last regex in string, we will update the pattern iff regexp value
changed.
+ @transient private var lastRegex: UTF8String = _
+ // last regex pattern, we cache it for performance concern
+ @transient private var pattern: Pattern = _
+
+ override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+if (!p.equals(lastRegex)) {
+ // regex value changed
+ lastRegex = p.asInstanceOf[UTF8String].clone()
+ pattern = Pattern.compile(lastRegex.toString)
+}
+val m = pattern.matcher(s.toString)
+var groupArrayBuffer = new ArrayBuffer[UTF8String]();
+
+while (m.find) {
+ val mr: MatchResult = m.toMatchResult
+ val group = mr.group(r.asInstanceOf[Int])
+ if (group == null) { // Pattern matched, but not optional group
+groupArrayBuffer += UTF8String.EMPTY_UTF8
+ } else {
+groupArrayBuffer += UTF8String.fromString(group)
+ }
+}
+
+new GenericArrayData(groupArrayBuffer.toArray.asInstanceOf[Array[Any]])
+ }
+
+ override def dataType: DataType = ArrayType(StringType)
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType,
StringType, IntegerType)
+ override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+ override def prettyName: String = "regexp_extract_all"
+
+ override protected def doGenCode(ctx: CodegenContext, ev: ExprCode):
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+val groupArray = ctx.freshName("groupArray")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val arrayClass = classOf[GenericArrayData].getName
+
+nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => {
+ s"""
+ if (!$regexp.equals($termLastRegex)) {
+// regex value changed
+$termLastRegex = $regexp.clone();
+$termPattern =
$classNamePattern.compile($termLastRegex.toString());
+ }
+ java.util.regex.Matcher $matcher =
+$termPattern.matcher($subject.toString());
+ java.util.ArrayList $groupArray =
+new java.util.ArrayList();
+
+ while ($matcher.find()) {
+java.util.regex.MatchResult $matchResult =
$matcher.toMatchResult();
+if ($matchResult.group($idx) == null) {
+ $groupArray.add(UTF8String.EMPTY_UTF8);
+} else {
+ $groupArray.add(UTF8String.fromString($matchResult.group($idx)));
+}
+ }
+ ${ev.value} = new $arrayClass($groupArray.toArray(new
UTF8String[$groupArray.size()]));
--- End diff --
Do we need to consider setting ev.isNull?
---
-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org