This is an automated email from the ASF dual-hosted git repository.

sankarh pushed a commit to branch branch-3
in repository https://gitbox.apache.org/repos/asf/hive.git


The following commit(s) were added to refs/heads/branch-3 by this push:
     new 6a5e6cd69bf HIVE-27605: Backport of HIVE-19661: Switch Hive UDFs to use Re2J regex engine (Rajkumar Singh via Ashutosh Chauhan)
6a5e6cd69bf is described below

commit 6a5e6cd69bf3928819e648ab8a98f6d78b6a64c7
Author: Aman Raj <104416558+amanraj2...@users.noreply.github.com>
AuthorDate: Mon Sep 4 14:58:09 2023 +0530

    HIVE-27605: Backport of HIVE-19661: Switch Hive UDFs to use Re2J regex engine (Rajkumar Singh via Ashutosh Chauhan)
    
    Signed-off-by: Sankar Hariappan <sank...@apache.org>
    Closes (#4584)
---
 LICENSE                                            | 30 +++++++++++
 .../java/org/apache/hadoop/hive/conf/HiveConf.java |  1 +
 pom.xml                                            |  6 +++
 ql/pom.xml                                         | 12 +++++
 .../hive/ql/udf/generic/GenericUDFRegExp.java      | 61 +++++++++++++++++-----
 5 files changed, 98 insertions(+), 12 deletions(-)

diff --git a/LICENSE b/LICENSE
index 3e7dc6b98cf..316afc629b8 100644
--- a/LICENSE
+++ b/LICENSE
@@ -404,4 +404,34 @@ products or services of Licensee, or any third party.
 agrees to be bound by the terms and conditions of this License
 Agreement.
 
+For google re2j (https://github.com/google/re2j/blob/master/LICENSE):
+
+Copyright (c) 2009 The Go Authors. All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   * Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+   * Redistributions in binary form must reproduce the above
+copyright notice, this list of conditions and the following disclaimer
+in the documentation and/or other materials provided with the
+distribution.
+   * Neither the name of Google Inc. nor the names of its
+contributors may be used to endorse or promote products derived from
+this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
 
diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
index 33796a24d19..606eedd1c4d 100644
--- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
+++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java
@@ -3716,6 +3716,7 @@ public class HiveConf extends Configuration {
          "Time to wait to finish prewarming spark executors"),
     HIVESTAGEIDREARRANGE("hive.stageid.rearrange", "none", new StringSet("none", "idonly", "traverse", "execution"), ""),
     HIVEEXPLAINDEPENDENCYAPPENDTASKTYPES("hive.explain.dependency.append.tasktype", false, ""),
+    HIVEUSEGOOGLEREGEXENGINE("hive.use.googleregex.engine",false,"whether to use google regex engine or not, default regex engine is java.util.regex"),
 
     HIVECOUNTERGROUP("hive.counters.group.name", "HIVE",
         "The name of counter group for internal Hive variables (CREATED_FILE, 
FATAL_ERROR, etc.)"),
diff --git a/pom.xml b/pom.xml
index b24f90da574..a07c7627a81 100644
--- a/pom.xml
+++ b/pom.xml
@@ -216,6 +216,7 @@
     <jsr305.version>3.0.0</jsr305.version>
     <tephra.version>0.6.0</tephra.version>
     <gson.version>2.8.9</gson.version>
+    <re2j.version>1.2</re2j.version>
   </properties>
 
   <repositories>
@@ -971,6 +972,11 @@
         <artifactId>snappy-java</artifactId>
         <version>${snappy.version}</version>
       </dependency>
+      <dependency>
+        <groupId>com.google.re2j</groupId>
+        <artifactId>re2j</artifactId>
+        <version>${re2j.version}</version>
+      </dependency>
     </dependencies>
   </dependencyManagement>
 
diff --git a/ql/pom.xml b/ql/pom.xml
index 1ed49bcde76..5df0873394f 100644
--- a/ql/pom.xml
+++ b/ql/pom.xml
@@ -768,6 +768,17 @@
       <version>${powermock.version}</version>
       <scope>test</scope>
     </dependency>
+    <dependency>
+      <groupId>com.google.guava</groupId>
+      <artifactId>guava-testlib</artifactId>
+      <version>${guava.version}</version>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>com.google.re2j</groupId>
+      <artifactId>re2j</artifactId>
+      <version>${re2j.version}</version>
+    </dependency>
   </dependencies>
 
   <profiles>
@@ -969,6 +980,7 @@
                   <include>org.apache.orc:orc-shims</include>
                   <include>org.apache.orc:orc-tools</include>
                   <include>joda-time:joda-time</include>
+                  <include>com.google.re2j:re2j</include>
                 </includes>
               </artifactSet>
               <relocations>
diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
index d309c37cc15..3bf3cfd3d9e 100644
--- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
+++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java
@@ -23,6 +23,9 @@ import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveO
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
+import org.apache.hadoop.hive.conf.HiveConf;
+import org.apache.hadoop.hive.ql.exec.MapredContext;
+import org.apache.hadoop.hive.ql.session.SessionState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.apache.hadoop.hive.ql.exec.Description;
@@ -36,7 +39,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.C
 import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory;
 import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory;
 import org.apache.hadoop.io.BooleanWritable;
-
 /**
  * UDF to extract a specific group identified by a java regex. Note that if a
  * regexp has a backslash ('\'), then need to specify '\\' For example,
@@ -54,11 +56,28 @@ public class GenericUDFRegExp extends GenericUDF {
   private final BooleanWritable output = new BooleanWritable();
   private transient boolean isRegexConst;
   private transient String regexConst;
-  private transient Pattern patternConst;
   private transient boolean warned;
+  private transient java.util.regex.Pattern patternConst;
+  private transient com.google.re2j.Pattern patternConstR2j;
+  private boolean useGoogleRegexEngine=false;
+
+  @Override
+  public void configure(MapredContext context) {
+    if (context != null) {
+      if(context.getJobConf().get("hive.use.googleregex.engine").equals("true")){
+        this.useGoogleRegexEngine=true;
+      }
+    }
+
+  }
 
   @Override
   public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException {
+    SessionState ss = SessionState.get();
+    if (ss != null) {
+      this.useGoogleRegexEngine = ss.getConf().getBoolVar(HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE);
+    }
+
     checkArgsSize(arguments, 2, 2);
 
     checkArgPrimitive(arguments, 0);
@@ -73,7 +92,12 @@ public class GenericUDFRegExp extends GenericUDF {
     if (arguments[1] instanceof ConstantObjectInspector) {
       regexConst = getConstantStringValue(arguments, 1);
       if (regexConst != null) {
-        patternConst = Pattern.compile(regexConst);
+        if(!useGoogleRegexEngine){
+          //if(!HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE)){
+          patternConst = Pattern.compile(regexConst);
+        }else{
+          patternConstR2j = com.google.re2j.Pattern.compile(regexConst);
+        }
       }
       isRegexConst = true;
     }
@@ -109,16 +133,29 @@ public class GenericUDFRegExp extends GenericUDF {
       return output;
     }
 
-    Pattern p;
-    if (isRegexConst) {
-      p = patternConst;
-    } else {
-      p = Pattern.compile(regex);
-    }
+    if(!useGoogleRegexEngine){
+      Pattern p;
+      if (isRegexConst) {
+        p = patternConst;
+      } else {
+        p = Pattern.compile(regex);
+      }
 
-    Matcher m = p.matcher(s);
-    output.set(m.find(0));
-    return output;
+      Matcher m = p.matcher(s);
+      output.set(m.find(0));
+      return output;
+    }else{
+      com.google.re2j.Pattern patternR2j;
+      if (isRegexConst) {
+        patternR2j = patternConstR2j;
+      } else {
+        patternR2j = com.google.re2j.Pattern.compile(regex);
+      }
+
+      com.google.re2j.Matcher m = patternR2j.matcher(s);
+      output.set(m.find(0));
+      return output;
+    }
   }
 
   @Override
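
As an illustrative sketch (not part of this patch): the evaluate() change above only swaps which Pattern/Matcher implementation is used, since RE2/J mirrors the java.util.regex calls the UDF relies on (compile, matcher, find) while matching in linear time without backreference or lookaround support. A rough standalone comparison of the two paths, assuming re2j is on the classpath:

    import java.util.regex.Pattern;

    public class RegexEngineSketch {
      public static void main(String[] args) {
        String s = "hive-3.1.3";
        String regex = "hive-\\d+\\.\\d+";

        // hive.use.googleregex.engine=false (default): java.util.regex path
        boolean jdkMatch = Pattern.compile(regex).matcher(s).find(0);

        // hive.use.googleregex.engine=true: RE2/J path
        boolean re2jMatch = com.google.re2j.Pattern.compile(regex).matcher(s).find(0);

        System.out.println(jdkMatch + " " + re2jMatch); // true true
      }
    }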
