This is an automated email from the ASF dual-hosted git repository. sankarh pushed a commit to branch branch-3 in repository https://gitbox.apache.org/repos/asf/hive.git
The following commit(s) were added to refs/heads/branch-3 by this push: new 6a5e6cd69bf HIVE-27605: Backport of HIVE-19661: Switch Hive UDFs to use Re2J regex engine (Rajkumar Singh via Ashutosh Chauhan) 6a5e6cd69bf is described below commit 6a5e6cd69bf3928819e648ab8a98f6d78b6a64c7 Author: Aman Raj <104416558+amanraj2...@users.noreply.github.com> AuthorDate: Mon Sep 4 14:58:09 2023 +0530 HIVE-27605: Backport of HIVE-19661: Switch Hive UDFs to use Re2J regex engine (Rajkumar Singh via Ashutosh Chauhan) Signed-off-by: Sankar Hariappan <sank...@apache.org> Closes (#4584) --- LICENSE | 30 +++++++++++ .../java/org/apache/hadoop/hive/conf/HiveConf.java | 1 + pom.xml | 6 +++ ql/pom.xml | 12 +++++ .../hive/ql/udf/generic/GenericUDFRegExp.java | 61 +++++++++++++++++----- 5 files changed, 98 insertions(+), 12 deletions(-) diff --git a/LICENSE b/LICENSE index 3e7dc6b98cf..316afc629b8 100644 --- a/LICENSE +++ b/LICENSE @@ -404,4 +404,34 @@ products or services of Licensee, or any third party. agrees to be bound by the terms and conditions of this License Agreement. +For google re2j (https://github.com/google/re2j/blob/master/LICENSE): + +Copyright (c) 2009 The Go Authors. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + * Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. + * Neither the name of Google Inc. nor the names of its +contributors may be used to endorse or promote products derived from +this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + diff --git a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java index 33796a24d19..606eedd1c4d 100644 --- a/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java +++ b/common/src/java/org/apache/hadoop/hive/conf/HiveConf.java @@ -3716,6 +3716,7 @@ public class HiveConf extends Configuration { "Time to wait to finish prewarming spark executors"), HIVESTAGEIDREARRANGE("hive.stageid.rearrange", "none", new StringSet("none", "idonly", "traverse", "execution"), ""), HIVEEXPLAINDEPENDENCYAPPENDTASKTYPES("hive.explain.dependency.append.tasktype", false, ""), + HIVEUSEGOOGLEREGEXENGINE("hive.use.googleregex.engine",false,"whether to use google regex engine or not, default regex engine is java.util.regex"), HIVECOUNTERGROUP("hive.counters.group.name", "HIVE", "The name of counter group for internal Hive variables (CREATED_FILE, FATAL_ERROR, etc.)"), diff --git a/pom.xml b/pom.xml index b24f90da574..a07c7627a81 100644 --- a/pom.xml +++ b/pom.xml @@ -216,6 +216,7 @@ <jsr305.version>3.0.0</jsr305.version> <tephra.version>0.6.0</tephra.version> <gson.version>2.8.9</gson.version> + <re2j.version>1.2</re2j.version> </properties> <repositories> @@ -971,6 +972,11 @@ <artifactId>snappy-java</artifactId> <version>${snappy.version}</version> </dependency> + <dependency> + <groupId>com.google.re2j</groupId> + <artifactId>re2j</artifactId> + <version>${re2j.version}</version> + </dependency> </dependencies> </dependencyManagement> diff --git a/ql/pom.xml b/ql/pom.xml index 1ed49bcde76..5df0873394f 100644 --- a/ql/pom.xml +++ b/ql/pom.xml @@ -768,6 +768,17 @@ <version>${powermock.version}</version> <scope>test</scope> </dependency> + <dependency> + <groupId>com.google.guava</groupId> + <artifactId>guava-testlib</artifactId> + <version>${guava.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>com.google.re2j</groupId> + <artifactId>re2j</artifactId> + <version>${re2j.version}</version> + </dependency> </dependencies> <profiles> @@ -969,6 +980,7 @@ <include>org.apache.orc:orc-shims</include> <include>org.apache.orc:orc-tools</include> <include>joda-time:joda-time</include> + <include>com.google.re2j:re2j</include> </includes> </artifactSet> <relocations> diff --git a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java index d309c37cc15..3bf3cfd3d9e 100644 --- a/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java +++ b/ql/src/java/org/apache/hadoop/hive/ql/udf/generic/GenericUDFRegExp.java @@ -23,6 +23,9 @@ import static org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveO import java.util.regex.Matcher; import java.util.regex.Pattern; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.ql.exec.MapredContext; +import org.apache.hadoop.hive.ql.session.SessionState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.hadoop.hive.ql.exec.Description; @@ -36,7 +39,6 @@ import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorConverters.C import org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector.PrimitiveCategory; import org.apache.hadoop.hive.serde2.objectinspector.primitive.PrimitiveObjectInspectorFactory; import org.apache.hadoop.io.BooleanWritable; - /** * UDF to extract a specific group identified by a java regex. Note that if a * regexp has a backslash ('\'), then need to specify '\\' For example, @@ -54,11 +56,28 @@ public class GenericUDFRegExp extends GenericUDF { private final BooleanWritable output = new BooleanWritable(); private transient boolean isRegexConst; private transient String regexConst; - private transient Pattern patternConst; private transient boolean warned; + private transient java.util.regex.Pattern patternConst; + private transient com.google.re2j.Pattern patternConstR2j; + private boolean useGoogleRegexEngine=false; + + @Override + public void configure(MapredContext context) { + if (context != null) { + if(context.getJobConf().get("hive.use.googleregex.engine").equals("true")){ + this.useGoogleRegexEngine=true; + } + } + + } @Override public ObjectInspector initialize(ObjectInspector[] arguments) throws UDFArgumentException { + SessionState ss = SessionState.get(); + if (ss != null) { + this.useGoogleRegexEngine = ss.getConf().getBoolVar(HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE); + } + checkArgsSize(arguments, 2, 2); checkArgPrimitive(arguments, 0); @@ -73,7 +92,12 @@ public class GenericUDFRegExp extends GenericUDF { if (arguments[1] instanceof ConstantObjectInspector) { regexConst = getConstantStringValue(arguments, 1); if (regexConst != null) { - patternConst = Pattern.compile(regexConst); + if(!useGoogleRegexEngine){ + //if(!HiveConf.getVar(hiveConf, HiveConf.ConfVars.HIVEUSEGOOGLEREGEXENGINE)){ + patternConst = Pattern.compile(regexConst); + }else{ + patternConstR2j = com.google.re2j.Pattern.compile(regexConst); + } } isRegexConst = true; } @@ -109,16 +133,29 @@ public class GenericUDFRegExp extends GenericUDF { return output; } - Pattern p; - if (isRegexConst) { - p = patternConst; - } else { - p = Pattern.compile(regex); - } + if(!useGoogleRegexEngine){ + Pattern p; + if (isRegexConst) { + p = patternConst; + } else { + p = Pattern.compile(regex); + } - Matcher m = p.matcher(s); - output.set(m.find(0)); - return output; + Matcher m = p.matcher(s); + output.set(m.find(0)); + return output; + }else{ + com.google.re2j.Pattern patternR2j; + if (isRegexConst) { + patternR2j = patternConstR2j; + } else { + patternR2j = com.google.re2j.Pattern.compile(regex); + } + + com.google.re2j.Matcher m = patternR2j.matcher(s); + output.set(m.find(0)); + return output; + } } @Override