Repository: incubator-eagle Updated Branches: refs/heads/master a6bc0a524 -> a710082d4
[EAGLE-615] Jsoup parse hive sql return String without line break "\n" EAGLE-615 Jsoup parse hive sql return String without line break "\n" - ADD "doc.outputSettings().prettyPrint(false);", get element value via getWholeText() not text() Author: chitin <chitin1...@gmail.com> Closes #499 from chitin/EAGLE615. Project: http://git-wip-us.apache.org/repos/asf/incubator-eagle/repo Commit: http://git-wip-us.apache.org/repos/asf/incubator-eagle/commit/a710082d Tree: http://git-wip-us.apache.org/repos/asf/incubator-eagle/tree/a710082d Diff: http://git-wip-us.apache.org/repos/asf/incubator-eagle/diff/a710082d Branch: refs/heads/master Commit: a710082d486e10b4732c00a06dd367dc556df60a Parents: a6bc0a5 Author: chitin <chitin1...@gmail.com> Authored: Mon Oct 17 11:42:27 2016 +0800 Committer: Hao Chen <h...@apache.org> Committed: Mon Oct 17 11:42:27 2016 +0800 ---------------------------------------------------------------------- .../hive/jobrunning/HiveJobFetchSpout.java | 35 +++++++++++++++----- 1 file changed, 26 insertions(+), 9 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/incubator-eagle/blob/a710082d/eagle-security/eagle-security-hive/src/main/java/org/apache/eagle/security/hive/jobrunning/HiveJobFetchSpout.java ---------------------------------------------------------------------- diff --git a/eagle-security/eagle-security-hive/src/main/java/org/apache/eagle/security/hive/jobrunning/HiveJobFetchSpout.java b/eagle-security/eagle-security-hive/src/main/java/org/apache/eagle/security/hive/jobrunning/HiveJobFetchSpout.java index c0673b3..af4599b 100644 --- a/eagle-security/eagle-security-hive/src/main/java/org/apache/eagle/security/hive/jobrunning/HiveJobFetchSpout.java +++ b/eagle-security/eagle-security-hive/src/main/java/org/apache/eagle/security/hive/jobrunning/HiveJobFetchSpout.java @@ -22,6 +22,7 @@ import backtype.storm.task.TopologyContext; import backtype.storm.topology.OutputFieldsDeclarer; import backtype.storm.topology.base.BaseRichSpout; import backtype.storm.tuple.Fields; +import org.apache.commons.lang.StringUtils; import org.apache.eagle.dataproc.impl.storm.ValuesArray; import org.apache.eagle.jpm.util.*; import org.apache.eagle.jpm.util.jobrecover.RunningJobManager; @@ -35,12 +36,14 @@ import org.apache.eagle.security.hive.config.RunningJobCrawlConfig; import org.codehaus.jackson.JsonParser; import org.codehaus.jackson.map.ObjectMapper; import org.jsoup.Jsoup; +import org.jsoup.nodes.TextNode; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.w3c.dom.Document; import org.w3c.dom.Element; import org.w3c.dom.Node; import org.w3c.dom.NodeList; + import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import java.io.InputStream; @@ -62,6 +65,7 @@ public class HiveJobFetchSpout extends BaseRichSpout { private Long lastFinishAppTime; private RunningJobManager runningJobManager; private int partitionId; + static { OBJ_MAPPER.configure(JsonParser.Feature.ALLOW_NON_NUMERIC_NUMBERS, true); } @@ -91,22 +95,22 @@ public class HiveJobFetchSpout extends BaseRichSpout { // sanity verify 0<=partitionId<=numTotalPartitions-1 if (partitionId < 0 || partitionId > crawlConfig.controlConfig.numTotalPartitions) { throw new IllegalStateException("partitionId should be less than numTotalPartitions with partitionId " + - partitionId + " and numTotalPartitions " + crawlConfig.controlConfig.numTotalPartitions); + partitionId + " and numTotalPartitions " + crawlConfig.controlConfig.numTotalPartitions); } Class<? extends JobIdPartitioner> partitionerCls = crawlConfig.controlConfig.partitionerCls; try { this.jobFilter = new JobIdFilterByPartition(partitionerCls.newInstance(), - crawlConfig.controlConfig.numTotalPartitions, partitionId); + crawlConfig.controlConfig.numTotalPartitions, partitionId); } catch (Exception e) { LOG.error("failing instantiating job partitioner class " + partitionerCls.getCanonicalName()); throw new IllegalStateException(e); } this.collector = collector; this.runningJobManager = new RunningJobManager(crawlConfig.zkStateConfig.zkQuorum, - crawlConfig.zkStateConfig.zkSessionTimeoutMs, - crawlConfig.zkStateConfig.zkRetryTimes, - crawlConfig.zkStateConfig.zkRetryInterval, - crawlConfig.zkStateConfig.zkRoot); + crawlConfig.zkStateConfig.zkSessionTimeoutMs, + crawlConfig.zkStateConfig.zkRetryTimes, + crawlConfig.zkStateConfig.zkRetryInterval, + crawlConfig.zkStateConfig.zkRoot); this.lastFinishAppTime = this.runningJobManager.recoverLastFinishedTime(partitionId); if (this.lastFinishAppTime == 0l) { this.lastFinishAppTime = Calendar.getInstance().getTimeInMillis() - 24 * 60 * 60000l;//one day ago @@ -119,7 +123,7 @@ public class HiveJobFetchSpout extends BaseRichSpout { LOG.info("start to fetch job list"); try { List<AppInfo> apps = rmResourceFetcher.getResource(Constants.ResourceType.RUNNING_MR_JOB); - if(apps == null){ + if (apps == null) { apps = new ArrayList<>(); } handleApps(apps, true); @@ -127,7 +131,7 @@ public class HiveJobFetchSpout extends BaseRichSpout { long fetchTime = Calendar.getInstance().getTimeInMillis(); if (fetchTime - this.lastFinishAppTime > 60000l) { apps = rmResourceFetcher.getResource(Constants.ResourceType.COMPLETE_MR_JOB, Long.toString(this.lastFinishAppTime)); - if(apps == null){ + if (apps == null) { apps = new ArrayList<>(); } handleApps(apps, false); @@ -230,6 +234,7 @@ public class HiveJobFetchSpout extends BaseRichSpout { LOG.info("fetch job conf from {}", urlString); is = InputStreamUtils.getInputStream(urlString, null, Constants.CompressionType.NONE); final org.jsoup.nodes.Document doc = Jsoup.parse(is, "UTF-8", urlString); + doc.outputSettings().prettyPrint(false); org.jsoup.select.Elements elements = doc.select("table[id=conf]").select("tbody").select("tr"); Map<String, String> hiveQueryLog = new HashMap<>(); Iterator<org.jsoup.nodes.Element> iter = elements.iterator(); @@ -237,7 +242,19 @@ public class HiveJobFetchSpout extends BaseRichSpout { org.jsoup.nodes.Element element = iter.next(); org.jsoup.select.Elements tds = element.children(); String key = tds.get(0).text(); - String value = tds.get(1).text(); + String value = ""; + org.jsoup.nodes.Element valueElement = tds.get(1); + if (Constants.HIVE_QUERY_STRING.equals(key)) { + for (org.jsoup.nodes.Node child : valueElement.childNodes()) { + if (child instanceof TextNode) { + TextNode valueTextNode = (TextNode) child; + value = valueTextNode.getWholeText(); + value = StringUtils.strip(value); + } + } + } else { + value = valueElement.text(); + } hiveQueryLog.put(key, value); } if (hiveQueryLog.containsKey(Constants.HIVE_QUERY_STRING)) {