OOZIE-2457 Oozie log parsing regex consume more than 90% cpu (satishsaley)

Project: http://git-wip-us.apache.org/repos/asf/oozie/repo
Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/0f27f92d
Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/0f27f92d
Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/0f27f92d

Branch: refs/heads/oya
Commit: 0f27f92d26abfa9b9fa54681ace5486b79e9959e
Parents: cd699bf
Author: satishsaley <satishsa...@apache.org>
Authored: Fri Apr 7 11:57:44 2017 -0700
Committer: satishsaley <satishsa...@apache.org>
Committed: Fri Apr 7 11:57:44 2017 -0700

----------------------------------------------------------------------
 .../java/org/apache/oozie/util/LogLine.java     | 64 ++++++++++++++++
 .../util/SimpleTimestampedMessageParser.java    | 15 +++-
 .../oozie/util/TimestampedMessageParser.java    | 81 +++++++++++++-------
 .../java/org/apache/oozie/util/XLogFilter.java  | 76 ++++++++++++++++--
 release-log.txt                                 |  1 +
 5 files changed, 200 insertions(+), 37 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/LogLine.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/oozie/util/LogLine.java 
b/core/src/main/java/org/apache/oozie/util/LogLine.java
new file mode 100644
index 0000000..88bf56b
--- /dev/null
+++ b/core/src/main/java/org/apache/oozie/util/LogLine.java
@@ -0,0 +1,64 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.oozie.util;
+
+import java.util.ArrayList;
+
+/**
+ * A wrapper over log line to save redundant use of regex
+ */
+public class LogLine {
+    /**
+     * List to hold parts of log message.
+     */
+    private ArrayList<String> logParts;
+    private String line;
+    /**
+     * Boolean to indicate if this log line has matched to the given log 
pattern
+     */
+    private MATCHED_PATTERN pattern = MATCHED_PATTERN.NONE;
+
+    public ArrayList<String> getLogParts() {
+        return logParts;
+    }
+
+    public void setLogParts(ArrayList<String> logParts) {
+        this.logParts = logParts;
+    }
+
+    public String getLine() {
+        return line;
+    }
+
+    public void setLine(String line) {
+        this.line = line;
+    }
+
+    public MATCHED_PATTERN getMatchedPattern() {
+        return pattern;
+    }
+
+    public void setMatchedPattern(MATCHED_PATTERN pattern) {
+        this.pattern = pattern;
+    }
+
+    enum MATCHED_PATTERN {
+        SPLIT, GENENRIC, NONE;
+    }
+}

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java 
b/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java
index 78cb042..0b96ad4 100644
--- 
a/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java
+++ 
b/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java
@@ -35,11 +35,18 @@ public class SimpleTimestampedMessageParser extends 
TimestampedMessageParser {
     /**
      * This implementation simply returns the next line from the Reader.
      *
-     * @return The next line from the Reader
+     * @return LogLine containing the next line from the Reader
      * @throws IOException
      */
     @Override
-    protected String parseNextLine() throws IOException {
-        return reader.readLine();
+    protected LogLine parseNextLogLine() throws IOException {
+        String line = null;
+        if ((line = reader.readLine()) != null) {
+            LogLine logLine = new LogLine();
+            logLine.setLine(line);
+            logLine.setLogParts(splitLogMessage(line));
+            return logLine;
+        }
+        return null;
     }
-}
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java
----------------------------------------------------------------------
diff --git 
a/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java 
b/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java
index a676f4d..830f287 100644
--- a/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java
+++ b/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java
@@ -22,8 +22,11 @@ import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Writer;
 import java.util.ArrayList;
+import java.util.regex.Pattern;
+
 import org.apache.oozie.service.Services;
 import org.apache.oozie.service.XLogStreamingService;
+import org.apache.oozie.util.LogLine.MATCHED_PATTERN;
 
   /**
  * Encapsulates the parsing and filtering of the log messages from a 
BufferedReader. It makes sure not to read in the entire log
@@ -39,13 +42,14 @@ public class TimestampedMessageParser {
 
     static final String SYSTEM_LINE_SEPARATOR = 
System.getProperty("line.separator");
     protected BufferedReader reader;
-    private String nextLine = null;
+    private LogLine nextLine = null;
     private String lastTimestamp = null;
     private XLogFilter filter;
     private boolean empty = false;
     private String lastMessage = null;
     private boolean patternMatched = false;
     public int count = 0;
+    private Pattern splitPattern = null;
 
     /**
      * Creates a TimestampedMessageParser with the given BufferedReader and 
filter.
@@ -60,6 +64,8 @@ public class TimestampedMessageParser {
             filter = new XLogFilter();
         }
         filter.constructPattern();
+        String regEx = XLogFilter.PREFIX_REGEX + 
filter.getFilterPattern().pattern();
+        this.splitPattern = Pattern.compile(regEx);
     }
 
 
@@ -75,10 +81,10 @@ public class TimestampedMessageParser {
         }
 
         StringBuilder message = new StringBuilder();
-
         if (nextLine == null) {     // first time only
-            nextLine = parseNextLine();
-            if (nextLine == null) { // reader finished
+            nextLine = parseNextLogLine();
+            if (nextLine == null || nextLine.getLine() == null) {
+                // reader finished
                 empty = true;
                 return false;
             }
@@ -86,10 +92,11 @@ public class TimestampedMessageParser {
         lastTimestamp = parseTimestamp(nextLine);
         String nextTimestamp = null;
         while (nextTimestamp == null) {
-            message.append(nextLine).append(SYSTEM_LINE_SEPARATOR);
-            nextLine = parseNextLine();
-            if (nextLine != null) {
-                nextTimestamp = parseTimestamp(nextLine);   // exit loop if we 
have a timestamp, continue if not
+            message.append(nextLine.getLine()).append(SYSTEM_LINE_SEPARATOR);
+            nextLine = parseNextLogLine();
+            if (nextLine != null && nextLine.getLine() != null) {
+                // exit loop if we have a timestamp, continue if not
+                nextTimestamp = parseTimestamp(nextLine);
             }
             else {                                          // reader finished
                 empty = true;
@@ -129,52 +136,59 @@ public class TimestampedMessageParser {
     }
 
     /**
-     * Reads the next line from the Reader and checks if it matches the 
filter.  It can also handle multi-line messages (i.e.
-     * exception stack traces).  If it returns null, then there are no lines 
left in the Reader.
+     * Reads the next line from the Reader and checks if it matches the filter.
+     * It can also handle multi-line messages (i.e. exception stack traces). If
+     * it returns null, then there are no lines left in the Reader.
      *
-     * @return The next line, or null
+     * @return LogLine
      * @throws IOException
      */
-    protected String parseNextLine() throws IOException {
+    protected LogLine parseNextLogLine() throws IOException {
         String line;
+        LogLine logLine = new LogLine();
         while ((line = reader.readLine()) != null) {
-            ArrayList<String> logParts = filter.splitLogMessage(line);
-            if (logParts != null) {
-                patternMatched = filter.matches(logParts);
-            }
+            logLine.setLine(line);
+            logLine.setLogParts(null);
+            filter.splitLogMessage(logLine, splitPattern);
+            // check the splits if logLine matches with the splitPattern
+            // Otherwise, go with previous patternMatched value. This is needed
+            // in parsing stack trace
+            patternMatched = logLine.getMatchedPattern() == 
MATCHED_PATTERN.NONE ? patternMatched
+                    : filter.splitsMatches(logLine);
             if (patternMatched) {
                 if (filter.getLogLimit() != -1) {
-                    if (logParts != null) {
+                    if (logLine.getLogParts() != null) {
                         if (count >= filter.getLogLimit()) {
                             return null;
                         }
                         count++;
                     }
                 }
-                if (logParts != null) {
+                if (logLine.getLogParts() != null) {
                     if (filter.getEndDate() != null) {
-                        //Ignore the milli second part
-                        if (logParts.get(0).substring(0, 
19).compareTo(filter.getFormattedEndDate()) > 0)
+                        // Ignore the milli second part
+                        if (logLine.getLogParts().get(0).substring(0, 
19).compareTo(filter.getFormattedEndDate()) > 0)
                             return null;
                     }
                 }
-                return line;
+                return logLine;
             }
         }
-        return line;
+        logLine.setLine(null);
+        return logLine;
     }
 
     /**
-     * Parses the timestamp out of the passed in line.  If there isn't one, it 
returns null.
+     * Parses the timestamp out of the passed in line. If there isn't one, it
+     * returns null.
      *
-     * @param line The line to check
+     * @param logLine The LogLine to check
      * @return the timestamp of the line, or null
      */
-    private String parseTimestamp(String line) {
+    private String parseTimestamp(LogLine logLine) {
         String timestamp = null;
-        ArrayList<String> logParts = filter.splitLogMessage(line);
-        if (logParts != null) {
-            timestamp = logParts.get(0);
+        if (logLine != null && logLine.getLogParts() != null && 
logLine.getLogParts().size() > 0) {
+            timestamp = logLine.getLogParts().get(0);
         }
         return timestamp;
     }
@@ -223,4 +237,15 @@ public class TimestampedMessageParser {
         processRemaining(writer, 
Services.get().get(XLogStreamingService.class).getBufferLen());
     }
 
+    /**
+     * Splits the log message into parts
+     *
+     * @param line
+     * @return List of log parts
+     */
+    protected ArrayList<String> splitLogMessage(String line) {
+        return filter.splitLogMessage(line);
+    }
+
+
 }

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/XLogFilter.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/oozie/util/XLogFilter.java 
b/core/src/main/java/org/apache/oozie/util/XLogFilter.java
index 3b49f77..cdf172b 100644
--- a/core/src/main/java/org/apache/oozie/util/XLogFilter.java
+++ b/core/src/main/java/org/apache/oozie/util/XLogFilter.java
@@ -24,12 +24,15 @@ import java.util.ArrayList;
 import java.util.Date;
 import java.util.HashMap;
 import java.util.List;
+import java.util.Locale;
 import java.util.Map;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import org.apache.commons.lang.StringUtils;
 import org.apache.oozie.service.ConfigurationService;
+import org.apache.oozie.util.LogLine.MATCHED_PATTERN;
+
 import com.google.common.annotations.VisibleForTesting;
 
 /**
@@ -61,7 +64,7 @@ public class XLogFilter {
     private static final String TIMESTAMP_REGEX = "(\\d\\d\\d\\d-\\d\\d-\\d\\d 
\\d\\d:\\d\\d:\\d\\d,\\d\\d\\d)";
     private static final String WHITE_SPACE_REGEX = "\\s+";
     private static final String LOG_LEVEL_REGEX = "(\\w+)";
-    private static final String PREFIX_REGEX = TIMESTAMP_REGEX + 
WHITE_SPACE_REGEX + LOG_LEVEL_REGEX
+    static final String PREFIX_REGEX = TIMESTAMP_REGEX + WHITE_SPACE_REGEX + 
LOG_LEVEL_REGEX
             + WHITE_SPACE_REGEX;
     private static final Pattern SPLITTER_PATTERN = 
Pattern.compile(PREFIX_REGEX + ALLOW_ALL_REGEX);
 
@@ -121,6 +124,33 @@ public class XLogFilter {
      * @param logParts
      * @return
      */
+    public boolean splitsMatches(LogLine logLine) {
+        // Check whether logLine matched with filter
+        if (logLine.getMatchedPattern() != MATCHED_PATTERN.SPLIT) {
+            return false;
+        }
+        ArrayList<String> logParts = logLine.getLogParts();
+        if (getStartDate() != null) {
+            if (logParts.get(0).substring(0, 
19).compareTo(getFormattedStartDate()) < 0) {
+                return false;
+            }
+        }
+        String logLevel = logParts.get(1);
+        if (this.logLevels == null || 
this.logLevels.containsKey(logLevel.toUpperCase(Locale.ENGLISH))) {
+            // line contains the permitted logLevel
+            return true;
+        }
+        else {
+            return false;
+        }
+    }
+
+    /**
+     * Checks if the logLevel and logMessage goes through the logFilter.
+     *
+     * @param logParts
+     * @return
+     */
     public boolean matches(ArrayList<String> logParts) {
         if (getStartDate() != null) {
             if (logParts.get(0).substring(0, 
19).compareTo(getFormattedStartDate()) < 0) {
@@ -139,8 +169,9 @@ public class XLogFilter {
     }
 
     /**
-     * Splits the log line into timestamp, logLevel and remaining log message. 
Returns array containing timestamp,
-     * logLevel, and logMessage if the pattern matches i.e A new log 
statement, else returns null.
+     * Splits the log line into timestamp, logLevel and remaining log message.
+     * Returns array containing timestamp, logLevel, and logMessage if the
+     * pattern matches i.e A new log statement, else returns null.
      *
      * @param logLine
      * @return Array containing log level and log message
@@ -160,8 +191,39 @@ public class XLogFilter {
     }
 
     /**
-     * Constructs the regular expression according to the filter and assigns 
it to fileterPattarn. ".*" will be assigned
-     * if no filters are set.
+     * If <code>logLine</code> matches with <code>splitPattern</code>,
+     * <ol>
+     * <li>Split the log line into timestamp, logLevel and remaining log
+     * message.</li>
+     * <li>Record the parts of message in <code>logLine</code> to avoid regex
+     * matching in future.</li>
+     * <li>Record the pattern to which <code>logLine</code> has matched.</li>
+     * </ol>
+     * @param logLine
+     * @param splitPattern
+     */
+    public void splitLogMessage(LogLine logLine, Pattern splitPattern) {
+        Matcher splitterWithJobId = splitPattern.matcher(logLine.getLine());
+        Matcher allowAll = SPLITTER_PATTERN.matcher(logLine.getLine());
+        if (splitterWithJobId.matches()) {
+            ArrayList<String> logParts = new ArrayList<String>(3);
+            logParts.add(splitterWithJobId.group(1));// timestamp
+            logParts.add(splitterWithJobId.group(2));// log level
+            logParts.add(splitterWithJobId.group(3));// log message
+            logLine.setLogParts(logParts);
+            logLine.setMatchedPattern(MATCHED_PATTERN.SPLIT);
+        }
+        else if (allowAll.matches()) {
+            logLine.setMatchedPattern(MATCHED_PATTERN.GENENRIC);
+        }
+        else {
+            logLine.setMatchedPattern(MATCHED_PATTERN.NONE);
+        }
+    }
+
+    /**
+     * Constructs the regular expression according to the filter and assigns it
+     * to fileterPattarn. ".*" will be assigned if no filters are set.
      */
     public void constructPattern() {
         if (noFilter && logLevels == null) {
@@ -346,4 +408,8 @@ public class XLogFilter {
         this.filterPattern = filterPattern;
     }
 
+    public Pattern getFilterPattern() {
+        return this.filterPattern;
+    }
+
 }

http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/release-log.txt
----------------------------------------------------------------------
diff --git a/release-log.txt b/release-log.txt
index 060e742..cb81d42 100644
--- a/release-log.txt
+++ b/release-log.txt
@@ -1,5 +1,6 @@
 -- Oozie 4.4.0 release (trunk - unreleased)
 
+OOZIE-2457 Oozie log parsing regex consume more than 90% cpu (satishsaley)
 OOZIE-2844 Increase stability of Oozie actions when log4j.properties is 
missing or not readable (andras.piros via pbacsko)
 OOZIE-2701 Oozie to support Multiple HCatalog URIs (abhishekbafna)
 OOZIE-2850 Fix default callback notifications (asasvari via gezapeti)

Reply via email to