OOZIE-2457 Oozie log parsing regex consume more than 90% cpu (satishsaley)
Project: http://git-wip-us.apache.org/repos/asf/oozie/repo Commit: http://git-wip-us.apache.org/repos/asf/oozie/commit/0f27f92d Tree: http://git-wip-us.apache.org/repos/asf/oozie/tree/0f27f92d Diff: http://git-wip-us.apache.org/repos/asf/oozie/diff/0f27f92d Branch: refs/heads/oya Commit: 0f27f92d26abfa9b9fa54681ace5486b79e9959e Parents: cd699bf Author: satishsaley <satishsa...@apache.org> Authored: Fri Apr 7 11:57:44 2017 -0700 Committer: satishsaley <satishsa...@apache.org> Committed: Fri Apr 7 11:57:44 2017 -0700 ---------------------------------------------------------------------- .../java/org/apache/oozie/util/LogLine.java | 64 ++++++++++++++++ .../util/SimpleTimestampedMessageParser.java | 15 +++- .../oozie/util/TimestampedMessageParser.java | 81 +++++++++++++------- .../java/org/apache/oozie/util/XLogFilter.java | 76 ++++++++++++++++-- release-log.txt | 1 + 5 files changed, 200 insertions(+), 37 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/LogLine.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/util/LogLine.java b/core/src/main/java/org/apache/oozie/util/LogLine.java new file mode 100644 index 0000000..88bf56b --- /dev/null +++ b/core/src/main/java/org/apache/oozie/util/LogLine.java @@ -0,0 +1,64 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.oozie.util; + +import java.util.ArrayList; + +/** + * A wrapper over log line to save redundant use of regex + */ +public class LogLine { + /** + * List to hold parts of log message. + */ + private ArrayList<String> logParts; + private String line; + /** + * Boolean to indicate if this log line has matched to the given log pattern + */ + private MATCHED_PATTERN pattern = MATCHED_PATTERN.NONE; + + public ArrayList<String> getLogParts() { + return logParts; + } + + public void setLogParts(ArrayList<String> logParts) { + this.logParts = logParts; + } + + public String getLine() { + return line; + } + + public void setLine(String line) { + this.line = line; + } + + public MATCHED_PATTERN getMatchedPattern() { + return pattern; + } + + public void setMatchedPattern(MATCHED_PATTERN pattern) { + this.pattern = pattern; + } + + enum MATCHED_PATTERN { + SPLIT, GENENRIC, NONE; + } +} http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java b/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java index 78cb042..0b96ad4 100644 --- a/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java +++ b/core/src/main/java/org/apache/oozie/util/SimpleTimestampedMessageParser.java @@ -35,11 +35,18 @@ public class SimpleTimestampedMessageParser extends TimestampedMessageParser { /** * This implementation simply returns the next line from the Reader. * - * @return The next line from the Reader + * @return LogLine containing the next line from the Reader * @throws IOException */ @Override - protected String parseNextLine() throws IOException { - return reader.readLine(); + protected LogLine parseNextLogLine() throws IOException { + String line = null; + if ((line = reader.readLine()) != null) { + LogLine logLine = new LogLine(); + logLine.setLine(line); + logLine.setLogParts(splitLogMessage(line)); + return logLine; + } + return null; } -} +} \ No newline at end of file http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java b/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java index a676f4d..830f287 100644 --- a/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java +++ b/core/src/main/java/org/apache/oozie/util/TimestampedMessageParser.java @@ -22,8 +22,11 @@ import java.io.BufferedReader; import java.io.IOException; import java.io.Writer; import java.util.ArrayList; +import java.util.regex.Pattern; + import org.apache.oozie.service.Services; import org.apache.oozie.service.XLogStreamingService; +import org.apache.oozie.util.LogLine.MATCHED_PATTERN; /** * Encapsulates the parsing and filtering of the log messages from a BufferedReader. It makes sure not to read in the entire log @@ -39,13 +42,14 @@ public class TimestampedMessageParser { static final String SYSTEM_LINE_SEPARATOR = System.getProperty("line.separator"); protected BufferedReader reader; - private String nextLine = null; + private LogLine nextLine = null; private String lastTimestamp = null; private XLogFilter filter; private boolean empty = false; private String lastMessage = null; private boolean patternMatched = false; public int count = 0; + private Pattern splitPattern = null; /** * Creates a TimestampedMessageParser with the given BufferedReader and filter. @@ -60,6 +64,8 @@ public class TimestampedMessageParser { filter = new XLogFilter(); } filter.constructPattern(); + String regEx = XLogFilter.PREFIX_REGEX + filter.getFilterPattern().pattern(); + this.splitPattern = Pattern.compile(regEx); } @@ -75,10 +81,10 @@ public class TimestampedMessageParser { } StringBuilder message = new StringBuilder(); - if (nextLine == null) { // first time only - nextLine = parseNextLine(); - if (nextLine == null) { // reader finished + nextLine = parseNextLogLine(); + if (nextLine == null || nextLine.getLine() == null) { + // reader finished empty = true; return false; } @@ -86,10 +92,11 @@ public class TimestampedMessageParser { lastTimestamp = parseTimestamp(nextLine); String nextTimestamp = null; while (nextTimestamp == null) { - message.append(nextLine).append(SYSTEM_LINE_SEPARATOR); - nextLine = parseNextLine(); - if (nextLine != null) { - nextTimestamp = parseTimestamp(nextLine); // exit loop if we have a timestamp, continue if not + message.append(nextLine.getLine()).append(SYSTEM_LINE_SEPARATOR); + nextLine = parseNextLogLine(); + if (nextLine != null && nextLine.getLine() != null) { + // exit loop if we have a timestamp, continue if not + nextTimestamp = parseTimestamp(nextLine); } else { // reader finished empty = true; @@ -129,52 +136,59 @@ public class TimestampedMessageParser { } /** - * Reads the next line from the Reader and checks if it matches the filter. It can also handle multi-line messages (i.e. - * exception stack traces). If it returns null, then there are no lines left in the Reader. + * Reads the next line from the Reader and checks if it matches the filter. + * It can also handle multi-line messages (i.e. exception stack traces). If + * it returns null, then there are no lines left in the Reader. * - * @return The next line, or null + * @return LogLine * @throws IOException */ - protected String parseNextLine() throws IOException { + protected LogLine parseNextLogLine() throws IOException { String line; + LogLine logLine = new LogLine(); while ((line = reader.readLine()) != null) { - ArrayList<String> logParts = filter.splitLogMessage(line); - if (logParts != null) { - patternMatched = filter.matches(logParts); - } + logLine.setLine(line); + logLine.setLogParts(null); + filter.splitLogMessage(logLine, splitPattern); + // check the splits if logLine matches with the splitPattern + // Otherwise, go with previous patternMatched value. This is needed + // in parsing stack trace + patternMatched = logLine.getMatchedPattern() == MATCHED_PATTERN.NONE ? patternMatched + : filter.splitsMatches(logLine); if (patternMatched) { if (filter.getLogLimit() != -1) { - if (logParts != null) { + if (logLine.getLogParts() != null) { if (count >= filter.getLogLimit()) { return null; } count++; } } - if (logParts != null) { + if (logLine.getLogParts() != null) { if (filter.getEndDate() != null) { - //Ignore the milli second part - if (logParts.get(0).substring(0, 19).compareTo(filter.getFormattedEndDate()) > 0) + // Ignore the milli second part + if (logLine.getLogParts().get(0).substring(0, 19).compareTo(filter.getFormattedEndDate()) > 0) return null; } } - return line; + return logLine; } } - return line; + logLine.setLine(null); + return logLine; } /** - * Parses the timestamp out of the passed in line. If there isn't one, it returns null. + * Parses the timestamp out of the passed in line. If there isn't one, it + * returns null. * - * @param line The line to check + * @param logLine The LogLine to check * @return the timestamp of the line, or null */ - private String parseTimestamp(String line) { + private String parseTimestamp(LogLine logLine) { String timestamp = null; - ArrayList<String> logParts = filter.splitLogMessage(line); - if (logParts != null) { - timestamp = logParts.get(0); + if (logLine != null && logLine.getLogParts() != null && logLine.getLogParts().size() > 0) { + timestamp = logLine.getLogParts().get(0); } return timestamp; } @@ -223,4 +237,15 @@ public class TimestampedMessageParser { processRemaining(writer, Services.get().get(XLogStreamingService.class).getBufferLen()); } + /** + * Splits the log message into parts + * + * @param line + * @return List of log parts + */ + protected ArrayList<String> splitLogMessage(String line) { + return filter.splitLogMessage(line); + } + + } http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/core/src/main/java/org/apache/oozie/util/XLogFilter.java ---------------------------------------------------------------------- diff --git a/core/src/main/java/org/apache/oozie/util/XLogFilter.java b/core/src/main/java/org/apache/oozie/util/XLogFilter.java index 3b49f77..cdf172b 100644 --- a/core/src/main/java/org/apache/oozie/util/XLogFilter.java +++ b/core/src/main/java/org/apache/oozie/util/XLogFilter.java @@ -24,12 +24,15 @@ import java.util.ArrayList; import java.util.Date; import java.util.HashMap; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import org.apache.commons.lang.StringUtils; import org.apache.oozie.service.ConfigurationService; +import org.apache.oozie.util.LogLine.MATCHED_PATTERN; + import com.google.common.annotations.VisibleForTesting; /** @@ -61,7 +64,7 @@ public class XLogFilter { private static final String TIMESTAMP_REGEX = "(\\d\\d\\d\\d-\\d\\d-\\d\\d \\d\\d:\\d\\d:\\d\\d,\\d\\d\\d)"; private static final String WHITE_SPACE_REGEX = "\\s+"; private static final String LOG_LEVEL_REGEX = "(\\w+)"; - private static final String PREFIX_REGEX = TIMESTAMP_REGEX + WHITE_SPACE_REGEX + LOG_LEVEL_REGEX + static final String PREFIX_REGEX = TIMESTAMP_REGEX + WHITE_SPACE_REGEX + LOG_LEVEL_REGEX + WHITE_SPACE_REGEX; private static final Pattern SPLITTER_PATTERN = Pattern.compile(PREFIX_REGEX + ALLOW_ALL_REGEX); @@ -121,6 +124,33 @@ public class XLogFilter { * @param logParts * @return */ + public boolean splitsMatches(LogLine logLine) { + // Check whether logLine matched with filter + if (logLine.getMatchedPattern() != MATCHED_PATTERN.SPLIT) { + return false; + } + ArrayList<String> logParts = logLine.getLogParts(); + if (getStartDate() != null) { + if (logParts.get(0).substring(0, 19).compareTo(getFormattedStartDate()) < 0) { + return false; + } + } + String logLevel = logParts.get(1); + if (this.logLevels == null || this.logLevels.containsKey(logLevel.toUpperCase(Locale.ENGLISH))) { + // line contains the permitted logLevel + return true; + } + else { + return false; + } + } + + /** + * Checks if the logLevel and logMessage goes through the logFilter. + * + * @param logParts + * @return + */ public boolean matches(ArrayList<String> logParts) { if (getStartDate() != null) { if (logParts.get(0).substring(0, 19).compareTo(getFormattedStartDate()) < 0) { @@ -139,8 +169,9 @@ public class XLogFilter { } /** - * Splits the log line into timestamp, logLevel and remaining log message. Returns array containing timestamp, - * logLevel, and logMessage if the pattern matches i.e A new log statement, else returns null. + * Splits the log line into timestamp, logLevel and remaining log message. + * Returns array containing timestamp, logLevel, and logMessage if the + * pattern matches i.e A new log statement, else returns null. * * @param logLine * @return Array containing log level and log message @@ -160,8 +191,39 @@ public class XLogFilter { } /** - * Constructs the regular expression according to the filter and assigns it to fileterPattarn. ".*" will be assigned - * if no filters are set. + * If <code>logLine</code> matches with <code>splitPattern</code>, + * <ol> + * <li>Split the log line into timestamp, logLevel and remaining log + * message.</li> + * <li>Record the parts of message in <code>logLine</code> to avoid regex + * matching in future.</li> + * <li>Record the pattern to which <code>logLine</code> has matched.</li> + * </ol> + * @param logLine + * @param splitPattern + */ + public void splitLogMessage(LogLine logLine, Pattern splitPattern) { + Matcher splitterWithJobId = splitPattern.matcher(logLine.getLine()); + Matcher allowAll = SPLITTER_PATTERN.matcher(logLine.getLine()); + if (splitterWithJobId.matches()) { + ArrayList<String> logParts = new ArrayList<String>(3); + logParts.add(splitterWithJobId.group(1));// timestamp + logParts.add(splitterWithJobId.group(2));// log level + logParts.add(splitterWithJobId.group(3));// log message + logLine.setLogParts(logParts); + logLine.setMatchedPattern(MATCHED_PATTERN.SPLIT); + } + else if (allowAll.matches()) { + logLine.setMatchedPattern(MATCHED_PATTERN.GENENRIC); + } + else { + logLine.setMatchedPattern(MATCHED_PATTERN.NONE); + } + } + + /** + * Constructs the regular expression according to the filter and assigns it + * to fileterPattarn. ".*" will be assigned if no filters are set. */ public void constructPattern() { if (noFilter && logLevels == null) { @@ -346,4 +408,8 @@ public class XLogFilter { this.filterPattern = filterPattern; } + public Pattern getFilterPattern() { + return this.filterPattern; + } + } http://git-wip-us.apache.org/repos/asf/oozie/blob/0f27f92d/release-log.txt ---------------------------------------------------------------------- diff --git a/release-log.txt b/release-log.txt index 060e742..cb81d42 100644 --- a/release-log.txt +++ b/release-log.txt @@ -1,5 +1,6 @@ -- Oozie 4.4.0 release (trunk - unreleased) +OOZIE-2457 Oozie log parsing regex consume more than 90% cpu (satishsaley) OOZIE-2844 Increase stability of Oozie actions when log4j.properties is missing or not readable (andras.piros via pbacsko) OOZIE-2701 Oozie to support Multiple HCatalog URIs (abhishekbafna) OOZIE-2850 Fix default callback notifications (asasvari via gezapeti)