[GitHub] spark pull request #21985: [SPARK-24884][SQL] add regexp_extract_all support

2018-08-04 Thread xuanyuanking
Github user xuanyuanking commented on a diff in the pull request:

https://github.com/apache/spark/pull/21985#discussion_r207712639
  
--- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ---
@@ -446,3 +448,88 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal 
mutable status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all groups that matches 
`regexp`.",
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200,300-400', '(\\d+)-(\\d+)', 1);
+   [100, 300]
+  """)
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
--- End diff --

Add an abstract class to reduce duplicated code between `RegExpExtractAll` 
and `RegExpExtract`?


---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #21985: [SPARK-24884][SQL] add regexp_extract_all support

2018-08-04 Thread xuanyuanking
Github user xuanyuanking commented on a diff in the pull request:

https://github.com/apache/spark/pull/21985#discussion_r207712323
  
--- Diff: 
sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
 ---
@@ -446,3 +448,88 @@ case class RegExpExtract(subject: Expression, regexp: 
Expression, idx: Expressio
 })
   }
 }
+
+/**
+ * Extract all specific(idx) groups identified by a Java regex.
+ *
+ * NOTE: this expression is not THREAD-SAFE, as it has some internal 
mutable status.
+ */
+@ExpressionDescription(
+  usage = "_FUNC_(str, regexp[, idx]) - Extracts all groups that matches 
`regexp`.",
+  examples = """
+Examples:
+  > SELECT _FUNC_('100-200,300-400', '(\\d+)-(\\d+)', 1);
+   [100, 300]
+  """)
+case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: 
Expression)
+  extends TernaryExpression with ImplicitCastInputTypes {
+  def this(s: Expression, r: Expression) = this(s, r, Literal(1))
+
+  // last regex in string, we will update the pattern iff regexp value 
changed.
+  @transient private var lastRegex: UTF8String = _
+  // last regex pattern, we cache it for performance concern
+  @transient private var pattern: Pattern = _
+
+  override def nullSafeEval(s: Any, p: Any, r: Any): Any = {
+if (!p.equals(lastRegex)) {
+  // regex value changed
+  lastRegex = p.asInstanceOf[UTF8String].clone()
+  pattern = Pattern.compile(lastRegex.toString)
+}
+val m = pattern.matcher(s.toString)
+var groupArrayBuffer = new ArrayBuffer[UTF8String]();
+
+while (m.find) {
+  val mr: MatchResult = m.toMatchResult
+  val group = mr.group(r.asInstanceOf[Int])
+  if (group == null) { // Pattern matched, but not optional group
+groupArrayBuffer += UTF8String.EMPTY_UTF8
+  } else {
+groupArrayBuffer += UTF8String.fromString(group)
+  }
+}
+
+new GenericArrayData(groupArrayBuffer.toArray.asInstanceOf[Array[Any]])
+  }
+
+  override def dataType: DataType = ArrayType(StringType)
+  override def inputTypes: Seq[AbstractDataType] = Seq(StringType, 
StringType, IntegerType)
+  override def children: Seq[Expression] = subject :: regexp :: idx :: Nil
+  override def prettyName: String = "regexp_extract_all"
+
+  override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): 
ExprCode = {
+val classNamePattern = classOf[Pattern].getCanonicalName
+val matcher = ctx.freshName("matcher")
+val matchResult = ctx.freshName("matchResult")
+val groupArray = ctx.freshName("groupArray")
+
+val termLastRegex = ctx.addMutableState("UTF8String", "lastRegex")
+val termPattern = ctx.addMutableState(classNamePattern, "pattern")
+
+val arrayClass = classOf[GenericArrayData].getName
+
+nullSafeCodeGen(ctx, ev, (subject, regexp, idx) => {
+  s"""
+  if (!$regexp.equals($termLastRegex)) {
+// regex value changed
+$termLastRegex = $regexp.clone();
+$termPattern = 
$classNamePattern.compile($termLastRegex.toString());
+  }
+  java.util.regex.Matcher $matcher =
+$termPattern.matcher($subject.toString());
+  java.util.ArrayList $groupArray =
+new java.util.ArrayList();
+
+  while ($matcher.find()) {
+java.util.regex.MatchResult $matchResult = 
$matcher.toMatchResult();
+if ($matchResult.group($idx) == null) {
+  $groupArray.add(UTF8String.EMPTY_UTF8);
+} else {
+  $groupArray.add(UTF8String.fromString($matchResult.group($idx)));
+}
+  }
+  ${ev.value} = new $arrayClass($groupArray.toArray(new 
UTF8String[$groupArray.size()]));
--- End diff --

Do we need to consider setting `ev.isNull`?


---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org



[GitHub] spark pull request #21985: [SPARK-24884][SQL] add regexp_extract_all support

2018-08-03 Thread xueyumusic
GitHub user xueyumusic opened a pull request:

https://github.com/apache/spark/pull/21985

[SPARK-24884][SQL] add regexp_extract_all support

## What changes were proposed in this pull request?
This PR adds `regexp_extract_all` support to Catalyst as `RegExpExtractAll`.

It finds all occurrences of the regular expression pattern in the string and 
returns, as an array, the specified capturing group from each match.

## How was this patch tested?

unit test

You can merge this pull request into a Git repository by running:

$ git pull https://github.com/xueyumusic/spark RegExpExtractAll

Alternatively you can review and apply these changes as the patch at:

https://github.com/apache/spark/pull/21985.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

This closes #21985


commit 2a9623879d91a9b7f33e1f4d252b8633de2c9e8b
Author: xueyu <278006819@...>
Date:   2018-08-03T13:22:14Z

RegExpExtractAll




---

-
To unsubscribe, e-mail: reviews-unsubscr...@spark.apache.org
For additional commands, e-mail: reviews-h...@spark.apache.org