Github user xuanyuanking commented on a diff in the pull request: https://github.com/apache/spark/pull/21985#discussion_r207712323 --- Diff: sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala --- @@ -446,3 +448,88 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio }) } } + +/** + * Extract all specific(idx) groups identified by a Java regex. + * + * NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status. + */ +@ExpressionDescription( + usage = "_FUNC_(str, regexp[, idx]) - Extracts all groups that matches `regexp`.", + examples = """ + Examples: + > SELECT _FUNC_('100-200,300-400', '(\\d+)-(\\d+)', 1); + [100, 300] + """) +case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expression) + extends TernaryExpression with ImplicitCastInputTypes { + def this(s: Expression, r: Expression) = this(s, r, Literal(1)) + + // last regex in string, we will update the pattern iff regexp value changed. 
+ @transient private var lastRegex: UTF8String = _ + // last regex pattern, we cache it for performance concern + @transient private var pattern: Pattern = _ + + override def nullSafeEval(s: Any, p: Any, r: Any): Any = { + if (!p.equals(lastRegex)) { + // regex value changed + lastRegex = p.asInstanceOf[UTF8String].clone() + pattern = Pattern.compile(lastRegex.toString) + } + val m = pattern.matcher(s.toString) + var groupArrayBuffer = new ArrayBuffer[UTF8String](); + + while (m.find) { + val mr: MatchResult = m.toMatchResult + val group = mr.group(r.asInstanceOf[Int]) + if (group == null) { // Pattern matched, but not optional group + groupArrayBuffer += UTF8String.EMPTY_UTF8 + } else { + groupArrayBuffer += UTF8String.fromString(group) + } + } + + new GenericArrayData(groupArrayBuffer.toArray.asInstanceOf[Array[Any]]) + } + + override def dataType: DataType = ArrayType(StringType) + override def inputTypes: Seq[AbstractDataType] = Seq(StringType, StringType, IntegerType) + override def children: Seq[Expression] = subject :: regexp :: idx :: Nil + override def prettyName: String = "regexp_extract_all" + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val classNamePattern = classOf[Pattern].getCanonicalName + val matcher = ctx.freshName("matcher") + val matchResult = ctx.freshName("matchResult") + val groupArray = ctx.freshName("groupArray") + + val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex") + val termPattern = ctx.addMutableState(classNamePattern, "pattern") + + val arrayClass = classOf[GenericArrayData].getName + + nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => { + s""" + if (!$regexp.equals($termLastRegex)) { + // regex value changed + $termLastRegex = $regexp.clone(); + $termPattern = $classNamePattern.compile($termLastRegex.toString()); + } + java.util.regex.Matcher $matcher = + $termPattern.matcher($subject.toString()); + java.util.ArrayList $groupArray = + new java.util.ArrayList<UTF8String>(); + 
+ while ($matcher.find()) { + java.util.regex.MatchResult $matchResult = $matcher.toMatchResult(); + if ($matchResult.group($idx) == null) { + $groupArray.add(UTF8String.EMPTY_UTF8); + } else { + $groupArray.add(UTF8String.fromString($matchResult.group($idx))); + } + } + ${ev.value} = new $arrayClass($groupArray.toArray(new UTF8String[$groupArray.size()])); --- End diff -- Do we need to consider setting ev.isNull?
--- --------------------------------------------------------------------- To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org For additional commands, e-mail: reviews-h...@spark.apache.org