[ https://issues.apache.org/jira/browse/OOZIE-2457?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Satish Subhashrao Saley reassigned OOZIE-2457: ---------------------------------------------- Assignee: Satish Subhashrao Saley > Oozie log parsing regex consume more than 90% cpu > ------------------------------------------------- > > Key: OOZIE-2457 > URL: https://issues.apache.org/jira/browse/OOZIE-2457 > Project: Oozie > Issue Type: Bug > Reporter: Satish Subhashrao Saley > Assignee: Satish Subhashrao Saley > Priority: Minor > > http-0.0.0.0-4080-26 TID=62215 STATE=RUNNABLE CPU_TIME=1992 (92.59%) > USER_TIME=1990 (92.46%) Allocted: 269156584 > java.util.regex.Pattern$Curly.match0(Pattern.java:4170) > java.util.regex.Pattern$Curly.match(Pattern.java:4132) > java.util.regex.Pattern$GroupHead.match(Pattern.java:4556) > java.util.regex.Matcher.match(Matcher.java:1221) > java.util.regex.Matcher.matches(Matcher.java:559) > org.apache.oozie.util.XLogFilter.matches(XLogFilter.java:136) > > org.apache.oozie.util.TimestampedMessageParser.parseNextLine(TimestampedMessageParser.java:145) > > org.apache.oozie.util.TimestampedMessageParser.increment(TimestampedMessageParser.java:92) > Regex > {code} > (.* USER\[[^\]]*\] GROUP\[[^\]]*\] TOKEN\[[^\]]*\] APP\[[^\]]*\] > JOB\[0000000-150625114739728-oozie-puru-W\] ACTION\[[^\]]*\] .*) > {code} > For single-line parsing we use two regexes. > 1. > {code} > public ArrayList<String> splitLogMessage(String logLine) { > Matcher splitter = SPLITTER_PATTERN.matcher(logLine); > if (splitter.matches()) { > ArrayList<String> logParts = new ArrayList<String>(); > logParts.add(splitter.group(1));// timestamp > logParts.add(splitter.group(2));// log level > logParts.add(splitter.group(3));// Log Message > return logParts; > } > else { > return null; > } > } > {code} > 2. 
> {code} > public boolean matches(ArrayList<String> logParts) { > if (getStartDate() != null) { > if (logParts.get(0).substring(0, > 19).compareTo(getFormattedStartDate()) < 0) { > return false; > } > } > String logLevel = logParts.get(1); > String logMessage = logParts.get(2); > if (this.logLevels == null || > this.logLevels.containsKey(logLevel.toUpperCase())) { > Matcher logMatcher = filterPattern.matcher(logMessage); > return logMatcher.matches(); > } > else { > return false; > } > } > {code} > Also, there is repetitive parsing of the same log message in > {code} > private String parseTimestamp(String line) { > String timestamp = null; > ArrayList<String> logParts = filter.splitLogMessage(line); > if (logParts != null) { > timestamp = logParts.get(0); > } > return timestamp; > } > {code} > where the {{line}} has already been parsed using the regex, so we already know the > {{logParts}}, if any. -- This message was sent by Atlassian JIRA (v6.3.4#6332)