Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilterException.java Thu Jan 29 05:38:59 2015 @@ -1,19 +1,19 @@ /* -* Licensed to the Apache Software Foundation (ASF) under one or more -* contributor license agreements. See the NOTICE file distributed with -* this work for additional information regarding copyright ownership. -* The ASF licenses this file to You under the Apache License, Version 2.0 -* (the "License"); you may not use this file except in compliance with -* the License. You may obtain a copy of the License at -* -* http://www.apache.org/licenses/LICENSE-2.0 -* -* Unless required by applicable law or agreed to in writing, software -* distributed under the License is distributed on an "AS IS" BASIS, -* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -* See the License for the specific language governing permissions and -* limitations under the License. -*/ + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.apache.nutch.scoring; /**
Modified: nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/ScoringFilters.java Thu Jan 29 05:38:59 2015 @@ -49,7 +49,8 @@ public class ScoringFilters extends Conf } /** Calculate a sort value for Generate. */ - public float generatorSortValue(Text url, CrawlDatum datum, float initSort) throws ScoringFilterException { + public float generatorSortValue(Text url, CrawlDatum datum, float initSort) + throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { initSort = this.filters[i].generatorSortValue(url, datum, initSort); } @@ -57,48 +58,59 @@ public class ScoringFilters extends Conf } /** Calculate a new initial score, used when adding newly discovered pages. */ - public void initialScore(Text url, CrawlDatum datum) throws ScoringFilterException { + public void initialScore(Text url, CrawlDatum datum) + throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].initialScore(url, datum); } } /** Calculate a new initial score, used when injecting new pages. */ - public void injectedScore(Text url, CrawlDatum datum) throws ScoringFilterException { + public void injectedScore(Text url, CrawlDatum datum) + throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].injectedScore(url, datum); } } /** Calculate updated page score during CrawlDb.update(). 
*/ - public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, List<CrawlDatum> inlinked) throws ScoringFilterException { + public void updateDbScore(Text url, CrawlDatum old, CrawlDatum datum, + List<CrawlDatum> inlinked) throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].updateDbScore(url, old, datum, inlinked); } } - public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) throws ScoringFilterException { + public void passScoreBeforeParsing(Text url, CrawlDatum datum, Content content) + throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].passScoreBeforeParsing(url, datum, content); } } - - public void passScoreAfterParsing(Text url, Content content, Parse parse) throws ScoringFilterException { + + public void passScoreAfterParsing(Text url, Content content, Parse parse) + throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { this.filters[i].passScoreAfterParsing(url, content, parse); } } - - public CrawlDatum distributeScoreToOutlinks(Text fromUrl, ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, CrawlDatum adjust, int allCount) throws ScoringFilterException { + + public CrawlDatum distributeScoreToOutlinks(Text fromUrl, + ParseData parseData, Collection<Entry<Text, CrawlDatum>> targets, + CrawlDatum adjust, int allCount) throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { - adjust = this.filters[i].distributeScoreToOutlinks(fromUrl, parseData, targets, adjust, allCount); + adjust = this.filters[i].distributeScoreToOutlinks(fromUrl, parseData, + targets, adjust, allCount); } return adjust; } - public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, CrawlDatum fetchDatum, Parse parse, Inlinks inlinks, float initScore) throws ScoringFilterException { + public float indexerScore(Text url, NutchDocument doc, CrawlDatum dbDatum, + CrawlDatum fetchDatum, 
Parse parse, Inlinks inlinks, float initScore) + throws ScoringFilterException { for (int i = 0; i < this.filters.length; i++) { - initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum, parse, inlinks, initScore); + initScore = this.filters[i].indexerScore(url, doc, dbDatum, fetchDatum, + parse, inlinks, initScore); } return initScore; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/package-info.java Thu Jan 29 05:38:59 2015 @@ -19,3 +19,4 @@ * The {@link org.apache.nutch.scoring.ScoringFilter ScoringFilter} interface. */ package org.apache.nutch.scoring; + Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDatum.java Thu Jan 29 05:38:59 2015 @@ -27,8 +27,7 @@ import org.apache.hadoop.io.Writable; * A class for holding link information including the url, anchor text, a score, * the timestamp of the link and a link type. */ -public class LinkDatum - implements Writable { +public class LinkDatum implements Writable { public final static byte INLINK = 1; public final static byte OUTLINK = 2; @@ -49,7 +48,8 @@ public class LinkDatum /** * Creates a LinkDatum with a given url. Timestamp is set to current time. * - * @param url The link url. + * @param url + * The link url. 
*/ public LinkDatum(String url) { this(url, "", System.currentTimeMillis()); @@ -59,8 +59,10 @@ public class LinkDatum * Creates a LinkDatum with a url and an anchor text. Timestamp is set to * current time. * - * @param url The link url. - * @param anchor The link anchor text. + * @param url + * The link url. + * @param anchor + * The link anchor text. */ public LinkDatum(String url, String anchor) { this(url, anchor, System.currentTimeMillis()); @@ -112,8 +114,7 @@ public class LinkDatum this.linkType = linkType; } - public void readFields(DataInput in) - throws IOException { + public void readFields(DataInput in) throws IOException { url = Text.readString(in); anchor = Text.readString(in); score = in.readFloat(); @@ -121,8 +122,7 @@ public class LinkDatum linkType = in.readByte(); } - public void write(DataOutput out) - throws IOException { + public void write(DataOutput out) throws IOException { Text.writeString(out, url); Text.writeString(out, anchor != null ? anchor : ""); out.writeFloat(score); @@ -132,9 +132,9 @@ public class LinkDatum public String toString() { - String type = (linkType == INLINK ? "inlink" : (linkType == OUTLINK) - ? "outlink" : "unknown"); + String type = (linkType == INLINK ? "inlink" + : (linkType == OUTLINK) ? 
"outlink" : "unknown"); return "url: " + url + ", anchor: " + anchor + ", score: " + score - + ", timestamp: " + timestamp + ", link type: " + type; + + ", timestamp: " + timestamp + ", link type: " + type; } } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java Thu Jan 29 05:38:59 2015 @@ -67,27 +67,24 @@ import org.apache.nutch.util.TimingUtil; /** * The LinkDumper tool creates a database of node to inlink information that can - * be read using the nested Reader class. This allows the inlink and scoring - * state of a single url to be reviewed quickly to determine why a given url is - * ranking a certain way. This tool is to be used with the LinkRank analysis. + * be read using the nested Reader class. This allows the inlink and scoring + * state of a single url to be reviewed quickly to determine why a given url is + * ranking a certain way. This tool is to be used with the LinkRank analysis. */ -public class LinkDumper - extends Configured - implements Tool { +public class LinkDumper extends Configured implements Tool { public static final Logger LOG = LoggerFactory.getLogger(LinkDumper.class); public static final String DUMP_DIR = "linkdump"; /** - * Reader class which will print out the url and all of its inlinks to system - * out. Each inlinkwill be displayed with its node information including - * score and number of in and outlinks. + * Reader class which will print out the url and all of its inlinks to system + * out. Each inlinkwill be displayed with its node information including score + * and number of in and outlinks. 
*/ public static class Reader { - public static void main(String[] args) - throws Exception { - + public static void main(String[] args) throws Exception { + if (args == null || args.length < 2) { System.out.println("LinkDumper$Reader usage: <webgraphdb> <url>"); return; @@ -99,20 +96,20 @@ public class LinkDumper Path webGraphDb = new Path(args[0]); String url = args[1]; MapFile.Reader[] readers = MapFileOutputFormat.getReaders(fs, new Path( - webGraphDb, DUMP_DIR), conf); + webGraphDb, DUMP_DIR), conf); // get the link nodes for the url Text key = new Text(url); LinkNodes nodes = new LinkNodes(); MapFileOutputFormat.getEntry(readers, - new HashPartitioner<Text, LinkNodes>(), key, nodes); + new HashPartitioner<Text, LinkNodes>(), key, nodes); // print out the link nodes LinkNode[] linkNodesAr = nodes.getLinks(); System.out.println(url + ":"); for (LinkNode node : linkNodesAr) { System.out.println(" " + node.getUrl() + " - " - + node.getNode().toString()); + + node.getNode().toString()); } // close the readers @@ -123,8 +120,7 @@ public class LinkDumper /** * Bean class which holds url to node information. */ - public static class LinkNode - implements Writable { + public static class LinkNode implements Writable { private String url = null; private Node node = null; @@ -154,15 +150,13 @@ public class LinkDumper this.node = node; } - public void readFields(DataInput in) - throws IOException { + public void readFields(DataInput in) throws IOException { url = in.readUTF(); node = new Node(); node.readFields(in); } - public void write(DataOutput out) - throws IOException { + public void write(DataOutput out) throws IOException { out.writeUTF(url); node.write(out); } @@ -172,8 +166,7 @@ public class LinkDumper /** * Writable class which holds an array of LinkNode objects. 
*/ - public static class LinkNodes - implements Writable { + public static class LinkNodes implements Writable { private LinkNode[] links; @@ -193,8 +186,7 @@ public class LinkDumper this.links = links; } - public void readFields(DataInput in) - throws IOException { + public void readFields(DataInput in) throws IOException { int numLinks = in.readInt(); if (numLinks > 0) { links = new LinkNode[numLinks]; @@ -206,8 +198,7 @@ public class LinkDumper } } - public void write(DataOutput out) - throws IOException { + public void write(DataOutput out) throws IOException { if (links != null && links.length > 0) { int numLinks = links.length; out.writeInt(numLinks); @@ -222,9 +213,9 @@ public class LinkDumper * Inverts outlinks from the WebGraph to inlinks and attaches node * information. */ - public static class Inverter - implements Mapper<Text, Writable, Text, ObjectWritable>, - Reducer<Text, ObjectWritable, Text, LinkNode> { + public static class Inverter implements + Mapper<Text, Writable, Text, ObjectWritable>, + Reducer<Text, ObjectWritable, Text, LinkNode> { private JobConf conf; @@ -236,8 +227,8 @@ public class LinkDumper * Wraps all values in ObjectWritables. */ public void map(Text key, Writable value, - OutputCollector<Text, ObjectWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ObjectWritable> output, Reporter reporter) + throws IOException { ObjectWritable objWrite = new ObjectWritable(); objWrite.set(value); @@ -245,12 +236,12 @@ public class LinkDumper } /** - * Inverts outlinks to inlinks while attaching node information to the + * Inverts outlinks to inlinks while attaching node information to the * outlink. 
*/ public void reduce(Text key, Iterator<ObjectWritable> values, - OutputCollector<Text, LinkNode> output, Reporter reporter) - throws IOException { + OutputCollector<Text, LinkNode> output, Reporter reporter) + throws IOException { String fromUrl = key.toString(); List<LinkDatum> outlinks = new ArrayList<LinkDatum>(); @@ -262,13 +253,11 @@ public class LinkDumper ObjectWritable write = values.next(); Object obj = write.get(); if (obj instanceof Node) { - node = (Node)obj; - } - else if (obj instanceof LinkDatum) { - outlinks.add(WritableUtils.clone((LinkDatum)obj, conf)); - } - else if (obj instanceof LoopSet) { - loops = (LoopSet)obj; + node = (Node) obj; + } else if (obj instanceof LinkDatum) { + outlinks.add(WritableUtils.clone((LinkDatum) obj, conf)); + } else if (obj instanceof LoopSet) { + loops = (LoopSet) obj; } } @@ -280,13 +269,13 @@ public class LinkDumper for (int i = 0; i < outlinks.size(); i++) { LinkDatum outlink = outlinks.get(i); String toUrl = outlink.getUrl(); - + // remove any url that is in the loopset, same as LinkRank if (loopSet != null && loopSet.contains(toUrl)) { continue; } - - // collect the outlink as an inlink with the node + + // collect the outlink as an inlink with the node output.collect(new Text(toUrl), new LinkNode(fromUrl, node)); } } @@ -297,11 +286,11 @@ public class LinkDumper } /** - * Merges LinkNode objects into a single array value per url. This allows - * all values to be quickly retrieved and printed via the Reader tool. + * Merges LinkNode objects into a single array value per url. This allows all + * values to be quickly retrieved and printed via the Reader tool. */ - public static class Merger - implements Reducer<Text, LinkNode, Text, LinkNodes> { + public static class Merger implements + Reducer<Text, LinkNode, Text, LinkNodes> { private JobConf conf; private int maxInlinks = 50000; @@ -314,8 +303,8 @@ public class LinkDumper * Aggregate all LinkNode objects for a given url. 
*/ public void reduce(Text key, Iterator<LinkNode> values, - OutputCollector<Text, LinkNodes> output, Reporter reporter) - throws IOException { + OutputCollector<Text, LinkNodes> output, Reporter reporter) + throws IOException { List<LinkNode> nodeList = new ArrayList<LinkNode>(); int numNodes = 0; @@ -325,8 +314,7 @@ public class LinkDumper if (numNodes < maxInlinks) { nodeList.add(WritableUtils.clone(cur, conf)); numNodes++; - } - else { + } else { break; } } @@ -342,11 +330,10 @@ public class LinkDumper } /** - * Runs the inverter and merger jobs of the LinkDumper tool to create the - * url to inlink node database. + * Runs the inverter and merger jobs of the LinkDumper tool to create the url + * to inlink node database. */ - public void dumpLinks(Path webGraphDb) - throws IOException { + public void dumpLinks(Path webGraphDb) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -362,7 +349,7 @@ public class LinkDumper // run the inverter job Path tempInverted = new Path(webGraphDb, "inverted-" - + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); JobConf inverter = new NutchJob(conf); inverter.setJobName("LinkDumper: inverter"); FileInputFormat.addInputPath(inverter, nodeDb); @@ -384,8 +371,7 @@ public class LinkDumper LOG.info("LinkDumper: running inverter"); JobClient.runJob(inverter); LOG.info("LinkDumper: finished inverter"); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -407,43 +393,41 @@ public class LinkDumper LOG.info("LinkDumper: running merger"); JobClient.runJob(merger); LOG.info("LinkDumper: finished merger"); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } fs.delete(tempInverted, true); long end = System.currentTimeMillis(); - LOG.info("LinkDumper: finished at 
" + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("LinkDumper: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new LinkDumper(), - args); + args); System.exit(res); } /** - * Runs the LinkDumper tool. This simply creates the database, to read the + * Runs the LinkDumper tool. This simply creates the database, to read the * values the nested Reader tool must be used. */ - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the web graph database to use"); Option webGraphDbOpts = OptionBuilder.create("webgraphdb"); options.addOption(webGraphDbOpts); - + CommandLineParser parser = new GnuParser(); try { @@ -457,8 +441,7 @@ public class LinkDumper String webGraphDb = line.getOptionValue("webgraphdb"); dumpLinks(new Path(webGraphDb)); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("LinkDumper: " + StringUtils.stringifyException(e)); return -2; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LinkRank.java Thu Jan 29 05:38:59 2015 @@ -68,9 +68,7 @@ import 
org.apache.nutch.util.NutchJob; import org.apache.nutch.util.TimingUtil; import org.apache.nutch.util.URLUtil; -public class LinkRank - extends Configured - implements Tool { +public class LinkRank extends Configured implements Tool { public static final Logger LOG = LoggerFactory.getLogger(LinkRank.class); private static final String NUM_NODES = "_num_nodes_"; @@ -79,14 +77,16 @@ public class LinkRank * Runs the counter job. The counter job determines the number of links in the * webgraph. This is used during analysis. * - * @param fs The job file system. - * @param webGraphDb The web graph database to use. + * @param fs + * The job file system. + * @param webGraphDb + * The web graph database to use. * * @return The number of nodes in the web graph. - * @throws IOException If an error occurs while running the counter job. + * @throws IOException + * If an error occurs while running the counter job. */ - private int runCounter(FileSystem fs, Path webGraphDb) - throws IOException { + private int runCounter(FileSystem fs, Path webGraphDb) throws IOException { // configure the counter job Path numLinksPath = new Path(webGraphDb, NUM_NODES); @@ -105,14 +105,14 @@ public class LinkRank counter.setOutputValueClass(LongWritable.class); counter.setNumReduceTasks(1); counter.setOutputFormat(TextOutputFormat.class); - counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + counter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", + false); // run the counter job, outputs to a single reduce task and file LOG.info("Starting link counter job"); try { JobClient.runJob(counter); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -125,13 +125,13 @@ public class LinkRank BufferedReader buffer = new BufferedReader(new InputStreamReader(readLinks)); String numLinksLine = buffer.readLine(); readLinks.close(); - + // check if there are links to process, if none, webgraph might be 
empty if (numLinksLine == null || numLinksLine.length() == 0) { fs.delete(numLinksPath, true); throw new IOException("No links to process, is the webgraph empty?"); } - + // delete temp file and convert and return the number of links as an int LOG.info("Deleting numlinks temp file"); fs.delete(numLinksPath, true); @@ -143,13 +143,15 @@ public class LinkRank * Runs the initializer job. The initializer job sets up the nodes with a * default starting score for link analysis. * - * @param nodeDb The node database to use. - * @param output The job output directory. + * @param nodeDb + * The node database to use. + * @param output + * The job output directory. * - * @throws IOException If an error occurs while running the initializer job. + * @throws IOException + * If an error occurs while running the initializer job. */ - private void runInitializer(Path nodeDb, Path output) - throws IOException { + private void runInitializer(Path nodeDb, Path output) throws IOException { // configure the initializer JobConf initializer = new NutchJob(getConf()); @@ -163,14 +165,14 @@ public class LinkRank initializer.setOutputKeyClass(Text.class); initializer.setOutputValueClass(Node.class); initializer.setOutputFormat(MapFileOutputFormat.class); - initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + initializer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", + false); // run the initializer LOG.info("Starting initialization job"); try { JobClient.runJob(initializer); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -186,15 +188,20 @@ public class LinkRank * space requirements but it can be very useful is weeding out and eliminating * link farms and other spam pages. * - * @param nodeDb The node database to use. - * @param outlinkDb The outlink database to use. - * @param loopDb The loop database to use if it exists. - * @param output The output directory. 
+ * @param nodeDb + * The node database to use. + * @param outlinkDb + * The outlink database to use. + * @param loopDb + * The loop database to use if it exists. + * @param output + * The output directory. * - * @throws IOException If an error occurs while running the inverter job. + * @throws IOException + * If an error occurs while running the inverter job. */ private void runInverter(Path nodeDb, Path outlinkDb, Path loopDb, Path output) - throws IOException { + throws IOException { // configure the inverter JobConf inverter = new NutchJob(getConf()); @@ -215,14 +222,14 @@ public class LinkRank inverter.setOutputKeyClass(Text.class); inverter.setOutputValueClass(LinkDatum.class); inverter.setOutputFormat(SequenceFileOutputFormat.class); - inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + inverter.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", + false); // run the inverter job LOG.info("Starting inverter job"); try { JobClient.runJob(inverter); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -236,23 +243,28 @@ public class LinkRank * Typically the link analysis job is run a number of times to allow the link * rank scores to converge. * - * @param nodeDb The node database from which we are getting previous link - * rank scores. - * @param inverted The inverted inlinks - * @param output The link analysis output. - * @param iteration The current iteration number. - * @param numIterations The total number of link analysis iterations + * @param nodeDb + * The node database from which we are getting previous link rank + * scores. + * @param inverted + * The inverted inlinks + * @param output + * The link analysis output. + * @param iteration + * The current iteration number. + * @param numIterations + * The total number of link analysis iterations * - * @throws IOException If an error occurs during link analysis. 
+ * @throws IOException + * If an error occurs during link analysis. */ private void runAnalysis(Path nodeDb, Path inverted, Path output, - int iteration, int numIterations, float rankOne) - throws IOException { + int iteration, int numIterations, float rankOne) throws IOException { JobConf analyzer = new NutchJob(getConf()); analyzer.set("link.analyze.iteration", String.valueOf(iteration + 1)); analyzer.setJobName("LinkAnalysis Analyzer, iteration " + (iteration + 1) - + " of " + numIterations); + + " of " + numIterations); FileInputFormat.addInputPath(analyzer, nodeDb); FileInputFormat.addInputPath(analyzer, inverted); FileOutputFormat.setOutputPath(analyzer, output); @@ -265,13 +277,13 @@ public class LinkRank analyzer.setOutputKeyClass(Text.class); analyzer.setOutputValueClass(Node.class); analyzer.setOutputFormat(MapFileOutputFormat.class); - analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", false); + analyzer.setBoolean("mapreduce.fileoutputcommitter.marksuccessfuljobs", + false); LOG.info("Starting analysis job"); try { JobClient.runJob(analyzer); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -283,9 +295,9 @@ public class LinkRank * This is used to determine a rank one score for pages with zero inlinks but * that contain outlinks. */ - private static class Counter - implements Mapper<Text, Node, Text, LongWritable>, - Reducer<Text, LongWritable, Text, LongWritable> { + private static class Counter implements + Mapper<Text, Node, Text, LongWritable>, + Reducer<Text, LongWritable, Text, LongWritable> { private static Text numNodes = new Text(NUM_NODES); private static LongWritable one = new LongWritable(1L); @@ -297,8 +309,8 @@ public class LinkRank * Outputs one for every node. 
*/ public void map(Text key, Node value, - OutputCollector<Text, LongWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, LongWritable> output, Reporter reporter) + throws IOException { output.collect(numNodes, one); } @@ -306,8 +318,8 @@ public class LinkRank * Totals the node number and outputs a single total value. */ public void reduce(Text key, Iterator<LongWritable> values, - OutputCollector<Text, LongWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, LongWritable> output, Reporter reporter) + throws IOException { long total = 0; while (values.hasNext()) { @@ -320,8 +332,7 @@ public class LinkRank } } - private static class Initializer - implements Mapper<Text, Node, Text, Node> { + private static class Initializer implements Mapper<Text, Node, Text, Node> { private JobConf conf; private float initialScore = 1.0f; @@ -332,8 +343,7 @@ public class LinkRank } public void map(Text key, Node node, OutputCollector<Text, Node> output, - Reporter reporter) - throws IOException { + Reporter reporter) throws IOException { String url = key.toString(); Node outNode = WritableUtils.clone(node, conf); @@ -351,9 +361,9 @@ public class LinkRank * WebGraph. The link analysis process consists of inverting, analyzing and * scoring, in a loop for a given number of iterations. 
*/ - private static class Inverter - implements Mapper<Text, Writable, Text, ObjectWritable>, - Reducer<Text, ObjectWritable, Text, LinkDatum> { + private static class Inverter implements + Mapper<Text, Writable, Text, ObjectWritable>, + Reducer<Text, ObjectWritable, Text, LinkDatum> { private JobConf conf; @@ -365,8 +375,8 @@ public class LinkRank * Convert values to ObjectWritable */ public void map(Text key, Writable value, - OutputCollector<Text, ObjectWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ObjectWritable> output, Reporter reporter) + throws IOException { ObjectWritable objWrite = new ObjectWritable(); objWrite.set(value); @@ -379,8 +389,8 @@ public class LinkRank * within the loopset. */ public void reduce(Text key, Iterator<ObjectWritable> values, - OutputCollector<Text, LinkDatum> output, Reporter reporter) - throws IOException { + OutputCollector<Text, LinkDatum> output, Reporter reporter) + throws IOException { String fromUrl = key.toString(); List<LinkDatum> outlinks = new ArrayList<LinkDatum>(); @@ -392,23 +402,25 @@ public class LinkRank ObjectWritable write = values.next(); Object obj = write.get(); if (obj instanceof Node) { - node = (Node)obj; - } - else if (obj instanceof LinkDatum) { - outlinks.add(WritableUtils.clone((LinkDatum)obj, conf)); - } - else if (obj instanceof LoopSet) { - loops = (LoopSet)obj; + node = (Node) obj; + } else if (obj instanceof LinkDatum) { + outlinks.add(WritableUtils.clone((LinkDatum) obj, conf)); + } else if (obj instanceof LoopSet) { + loops = (LoopSet) obj; } } - // Check for the possibility of a LoopSet object without Node and LinkDatum objects. This can happen - // with webgraphs that receive deletes (e.g. link.delete.gone and/or URL filters or normalizers) but + // Check for the possibility of a LoopSet object without Node and + // LinkDatum objects. This can happen + // with webgraphs that receive deletes (e.g. 
link.delete.gone and/or URL + // filters or normalizers) but // without an updated Loops database. // See: https://issues.apache.org/jira/browse/NUTCH-1299 if (node == null && loops != null) { // Nothing to do - LOG.warn("LoopSet without Node object received for " + key.toString() + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph."); + LOG.warn("LoopSet without Node object received for " + + key.toString() + + " . You should either not use Loops as input of the LinkRank program or rerun the Loops program over the WebGraph."); return; } @@ -430,7 +442,7 @@ public class LinkRank // remove any url that is contained in the loopset if (loopSet != null && loopSet.contains(toUrl)) { LOG.debug(fromUrl + ": Skipping inverting inlink from loop " - + toUrl); + + toUrl); continue; } outlink.setUrl(fromUrl); @@ -439,8 +451,8 @@ public class LinkRank // collect the inverted outlink output.collect(new Text(toUrl), outlink); LOG.debug(toUrl + ": inverting inlink from " + fromUrl - + " origscore: " + inlinkScore + " numOutlinks: " + numOutlinks - + " inlinkscore: " + outlinkScore); + + " origscore: " + inlinkScore + " numOutlinks: " + numOutlinks + + " inlinkscore: " + outlinkScore); } } } @@ -452,9 +464,9 @@ public class LinkRank /** * Runs a single link analysis iteration. 
*/ - private static class Analyzer - implements Mapper<Text, Writable, Text, ObjectWritable>, - Reducer<Text, ObjectWritable, Text, Node> { + private static class Analyzer implements + Mapper<Text, Writable, Text, ObjectWritable>, + Reducer<Text, ObjectWritable, Text, Node> { private JobConf conf; private float dampingFactor = 0.85f; @@ -471,13 +483,13 @@ public class LinkRank try { this.conf = conf; - this.dampingFactor = conf.getFloat("link.analyze.damping.factor", 0.85f); + this.dampingFactor = conf + .getFloat("link.analyze.damping.factor", 0.85f); this.rankOne = conf.getFloat("link.analyze.rank.one", 0.0f); this.itNum = conf.getInt("link.analyze.iteration", 0); limitPages = conf.getBoolean("link.ignore.limit.page", true); limitDomains = conf.getBoolean("link.ignore.limit.domain", true); - } - catch (Exception e) { + } catch (Exception e) { LOG.error(StringUtils.stringifyException(e)); throw new IllegalArgumentException(e); } @@ -487,8 +499,8 @@ public class LinkRank * Convert values to ObjectWritable */ public void map(Text key, Writable value, - OutputCollector<Text, ObjectWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ObjectWritable> output, Reporter reporter) + throws IOException { ObjectWritable objWrite = new ObjectWritable(); objWrite.set(WritableUtils.clone(value, conf)); @@ -500,8 +512,8 @@ public class LinkRank * stored in a temporary NodeDb which replaces the NodeDb of the WebGraph. 
*/ public void reduce(Text key, Iterator<ObjectWritable> values, - OutputCollector<Text, Node> output, Reporter reporter) - throws IOException { + OutputCollector<Text, Node> output, Reporter reporter) + throws IOException { String url = key.toString(); Set<String> domains = new HashSet<String>(); @@ -517,11 +529,10 @@ public class LinkRank ObjectWritable next = values.next(); Object value = next.get(); if (value instanceof Node) { - node = (Node)value; - } - else if (value instanceof LinkDatum) { + node = (Node) value; + } else if (value instanceof LinkDatum) { - LinkDatum linkDatum = (LinkDatum)value; + LinkDatum linkDatum = (LinkDatum) value; float scoreFromInlink = linkDatum.getScore(); String inlinkUrl = linkDatum.getUrl(); String inLinkDomain = URLUtil.getDomainName(inlinkUrl); @@ -529,9 +540,9 @@ public class LinkRank // limit counting duplicate inlinks by pages or domains if ((limitPages && pages.contains(inLinkPage)) - || (limitDomains && domains.contains(inLinkDomain))) { + || (limitDomains && domains.contains(inLinkDomain))) { LOG.debug(url + ": ignoring " + scoreFromInlink + " from " - + inlinkUrl + ", duplicate page or domain"); + + inlinkUrl + ", duplicate page or domain"); continue; } @@ -541,16 +552,16 @@ public class LinkRank domains.add(inLinkDomain); pages.add(inLinkPage); LOG.debug(url + ": adding " + scoreFromInlink + " from " + inlinkUrl - + ", total: " + totalInlinkScore); + + ", total: " + totalInlinkScore); } } // calculate linkRank score formula float linkRankScore = (1 - this.dampingFactor) - + (this.dampingFactor * totalInlinkScore); + + (this.dampingFactor * totalInlinkScore); LOG.debug(url + ": score: " + linkRankScore + " num inlinks: " - + numInlinks + " iteration: " + itNum); + + numInlinks + " iteration: " + itNum); // store the score in a temporary NodeDb Node outNode = WritableUtils.clone(node, conf); @@ -558,8 +569,7 @@ public class LinkRank output.collect(key, outNode); } - public void close() - throws IOException { + public 
void close() throws IOException { } } @@ -586,12 +596,13 @@ public class LinkRank * by default 10. And finally replaces the NodeDb in the WebGraph with the * link rank output. * - * @param webGraphDb The WebGraph to run link analysis on. + * @param webGraphDb + * The WebGraph to run link analysis on. * - * @throws IOException If an error occurs during link analysis. + * @throws IOException + * If an error occurs during link analysis. */ - public void analyze(Path webGraphDb) - throws IOException { + public void analyze(Path webGraphDb) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -621,7 +632,7 @@ public class LinkRank // initialze all urls with a default score int numLinks = runCounter(fs, webGraphDb); runInitializer(wgNodeDb, nodeDb); - float rankOneScore = (1f / (float)numLinks); + float rankOneScore = (1f / (float) numLinks); if (LOG.isInfoEnabled()) { LOG.info("Analysis: Number of links: " + numLinks); @@ -634,9 +645,10 @@ public class LinkRank for (int i = 0; i < numIterations; i++) { // the input to inverting is always the previous output from analysis - LOG.info("Analysis: Starting iteration " + (i + 1) + " of " + numIterations); + LOG.info("Analysis: Starting iteration " + (i + 1) + " of " + + numIterations); Path tempRank = new Path(linkRank + "-" - + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); fs.mkdirs(tempRank); Path tempInverted = new Path(tempRank, "inverted"); Path tempNodeDb = new Path(tempRank, WebGraph.NODE_DIR); @@ -644,13 +656,13 @@ public class LinkRank // run invert and analysis runInverter(nodeDb, wgOutlinkDb, loopDb, tempInverted); runAnalysis(nodeDb, tempInverted, tempNodeDb, i, numIterations, - rankOneScore); + rankOneScore); // replace the temporary NodeDb with the output from analysis LOG.info("Analysis: Installing new link scores"); FSUtils.replace(fs, linkRank, tempRank, 
true); LOG.info("Analysis: finished iteration " + (i + 1) + " of " - + numIterations); + + numIterations); } // replace the NodeDb in the WebGraph with the final output of analysis @@ -660,11 +672,11 @@ public class LinkRank // remove the temporary link rank folder fs.delete(linkRank, true); long end = System.currentTimeMillis(); - LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("Analysis: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new LinkRank(), args); System.exit(res); } @@ -672,15 +684,14 @@ public class LinkRank /** * Runs the LinkRank tool. */ - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the web graph db to use"); @@ -701,8 +712,7 @@ public class LinkRank analyze(new Path(webGraphDb)); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("LinkAnalysis: " + StringUtils.stringifyException(e)); return -2; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/LoopReader.java Thu Jan 29 05:38:59 2015 @@ 
-44,35 +44,38 @@ public class LoopReader extends Configur private FileSystem fs; private MapFile.Reader[] loopReaders; - - public LoopReader() { } - + + public LoopReader() { + } + public LoopReader(Configuration conf) { super(conf); } /** - * Prints loopset for a single url. The loopset information will show any + * Prints loopset for a single url. The loopset information will show any * outlink url the eventually forms a link cycle. * - * @param webGraphDb The WebGraph to check for loops - * @param url The url to check. + * @param webGraphDb + * The WebGraph to check for loops + * @param url + * The url to check. * - * @throws IOException If an error occurs while printing loopset information. + * @throws IOException + * If an error occurs while printing loopset information. */ - public void dumpUrl(Path webGraphDb, String url) - throws IOException { + public void dumpUrl(Path webGraphDb, String url) throws IOException { // open the readers fs = FileSystem.get(getConf()); loopReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, - Loops.LOOPS_DIR), getConf()); + Loops.LOOPS_DIR), getConf()); // get the loopset for a given url, if any Text key = new Text(url); LoopSet loop = new LoopSet(); MapFileOutputFormat.getEntry(loopReaders, - new HashPartitioner<Text, LoopSet>(), key, loop); + new HashPartitioner<Text, LoopSet>(), key, loop); // print out each loop url in the set System.out.println(url + ":"); @@ -85,24 +88,23 @@ public class LoopReader extends Configur } /** - * Runs the LoopReader tool. For this tool to work the loops job must have + * Runs the LoopReader tool. For this tool to work the loops job must have * already been run on the corresponding WebGraph. 
*/ - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the webgraphdb to use"); Option webGraphOpts = OptionBuilder.create("webgraphdb"); options.addOption(webGraphOpts); - + OptionBuilder.withArgName("url"); OptionBuilder.hasOptionalArg(); OptionBuilder.withDescription("the url to dump"); @@ -114,7 +116,7 @@ public class LoopReader extends Configur CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") - || !line.hasOption("url")) { + || !line.hasOption("url")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("WebGraphReader", options); return; @@ -125,8 +127,7 @@ public class LoopReader extends Configur LoopReader reader = new LoopReader(NutchConfiguration.create()); reader.dumpUrl(new Path(webGraphDb), url); return; - } - catch (Exception e) { + } catch (Exception e) { e.printStackTrace(); return; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Loops.java Thu Jan 29 05:38:59 2015 @@ -76,9 +76,7 @@ import org.apache.nutch.util.TimingUtil; * rather small. Because of this the Loops job is optional and if it doesn't * exist then it won't be factored into the LinkRank program. 
*/ -public class Loops - extends Configured - implements Tool { +public class Loops extends Configured implements Tool { public static final Logger LOG = LoggerFactory.getLogger(Loops.class); public static final String LOOPS_DIR = "loops"; @@ -87,8 +85,7 @@ public class Loops /** * A link path or route looking to identify a link cycle. */ - public static class Route - implements Writable { + public static class Route implements Writable { private String outlinkUrl = null; private String lookingFor = null; @@ -122,16 +119,14 @@ public class Loops this.found = found; } - public void readFields(DataInput in) - throws IOException { + public void readFields(DataInput in) throws IOException { outlinkUrl = Text.readString(in); lookingFor = Text.readString(in); found = in.readBoolean(); } - public void write(DataOutput out) - throws IOException { + public void write(DataOutput out) throws IOException { Text.writeString(out, outlinkUrl); Text.writeString(out, lookingFor); out.writeBoolean(found); @@ -141,8 +136,7 @@ public class Loops /** * A set of loops. */ - public static class LoopSet - implements Writable { + public static class LoopSet implements Writable { private Set<String> loopSet = new HashSet<String>(); @@ -158,8 +152,7 @@ public class Loops this.loopSet = loopSet; } - public void readFields(DataInput in) - throws IOException { + public void readFields(DataInput in) throws IOException { int numNodes = in.readInt(); loopSet = new HashSet<String>(); @@ -169,8 +162,7 @@ public class Loops } } - public void write(DataOutput out) - throws IOException { + public void write(DataOutput out) throws IOException { int numNodes = (loopSet != null ? loopSet.size() : 0); out.writeInt(numNodes); @@ -191,10 +183,9 @@ public class Loops /** * Initializes the Loop routes. 
*/ - public static class Initializer - extends Configured - implements Mapper<Text, Writable, Text, ObjectWritable>, - Reducer<Text, ObjectWritable, Text, Route> { + public static class Initializer extends Configured implements + Mapper<Text, Writable, Text, ObjectWritable>, + Reducer<Text, ObjectWritable, Text, Route> { private JobConf conf; @@ -222,8 +213,8 @@ public class Loops * Wraps values in ObjectWritable. */ public void map(Text key, Writable value, - OutputCollector<Text, ObjectWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ObjectWritable> output, Reporter reporter) + throws IOException { ObjectWritable objWrite = new ObjectWritable(); objWrite.set(value); @@ -236,8 +227,8 @@ public class Loops * the Looper job. */ public void reduce(Text key, Iterator<ObjectWritable> values, - OutputCollector<Text, Route> output, Reporter reporter) - throws IOException { + OutputCollector<Text, Route> output, Reporter reporter) + throws IOException { String url = key.toString(); Node node = null; @@ -248,10 +239,9 @@ public class Loops ObjectWritable objWrite = values.next(); Object obj = objWrite.get(); if (obj instanceof LinkDatum) { - outlinkList.add((LinkDatum)obj); - } - else if (obj instanceof Node) { - node = (Node)obj; + outlinkList.add((LinkDatum) obj); + } else if (obj instanceof Node) { + node = (Node) obj; } } @@ -282,10 +272,9 @@ public class Loops * Follows a route path looking for the start url of the route. If the start * url is found then the route is a cyclical path. */ - public static class Looper - extends Configured - implements Mapper<Text, Writable, Text, ObjectWritable>, - Reducer<Text, ObjectWritable, Text, Route> { + public static class Looper extends Configured implements + Mapper<Text, Writable, Text, ObjectWritable>, + Reducer<Text, ObjectWritable, Text, Route> { private JobConf conf; private boolean last = false; @@ -315,15 +304,14 @@ public class Loops * Wrap values in ObjectWritable. 
*/ public void map(Text key, Writable value, - OutputCollector<Text, ObjectWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ObjectWritable> output, Reporter reporter) + throws IOException { ObjectWritable objWrite = new ObjectWritable(); Writable cloned = null; if (value instanceof LinkDatum) { - cloned = new Text(((LinkDatum)value).getUrl()); - } - else { + cloned = new Text(((LinkDatum) value).getUrl()); + } else { cloned = WritableUtils.clone(value, conf); } objWrite.set(cloned); @@ -336,8 +324,8 @@ public class Loops * passes. */ public void reduce(Text key, Iterator<ObjectWritable> values, - OutputCollector<Text, Route> output, Reporter reporter) - throws IOException { + OutputCollector<Text, Route> output, Reporter reporter) + throws IOException { List<Route> routeList = new ArrayList<Route>(); Set<String> outlinkUrls = new LinkedHashSet<String>(); @@ -348,10 +336,9 @@ public class Loops ObjectWritable next = values.next(); Object value = next.get(); if (value instanceof Route) { - routeList.add(WritableUtils.clone((Route)value, conf)); - } - else if (value instanceof Text) { - String outlinkUrl = ((Text)value).toString(); + routeList.add(WritableUtils.clone((Route) value, conf)); + } else if (value instanceof Text) { + String outlinkUrl = ((Text) value).toString(); if (!outlinkUrls.contains(outlinkUrl)) { outlinkUrls.add(outlinkUrl); } @@ -375,16 +362,14 @@ public class Loops routeIt.remove(); if (route.isFound()) { output.collect(key, route); - } - else { + } else { // if the route start url is found, set route to found and collect String lookingFor = route.getLookingFor(); if (outlinkUrls.contains(lookingFor)) { route.setFound(true); output.collect(key, route); - } - else if (!last) { + } else if (!last) { // setup for next pass through the loop for (String outlink : outlinkUrls) { @@ -402,10 +387,8 @@ public class Loops /** * Finishes the Loops job by aggregating and collecting and found routes. 
*/ - public static class Finalizer - extends Configured - implements Mapper<Text, Route, Text, Route>, - Reducer<Text, Route, Text, LoopSet> { + public static class Finalizer extends Configured implements + Mapper<Text, Route, Text, Route>, Reducer<Text, Route, Text, LoopSet> { private JobConf conf; @@ -433,8 +416,7 @@ public class Loops * Maps out and found routes, those will be the link cycles. */ public void map(Text key, Route value, OutputCollector<Text, Route> output, - Reporter reporter) - throws IOException { + Reporter reporter) throws IOException { if (value.isFound()) { String lookingFor = value.getLookingFor(); @@ -443,12 +425,12 @@ public class Loops } /** - * Aggregates all found routes for a given start url into a loopset and + * Aggregates all found routes for a given start url into a loopset and * collects the loopset. */ public void reduce(Text key, Iterator<Route> values, - OutputCollector<Text, LoopSet> output, Reporter reporter) - throws IOException { + OutputCollector<Text, LoopSet> output, Reporter reporter) + throws IOException { LoopSet loops = new LoopSet(); while (values.hasNext()) { @@ -465,8 +447,7 @@ public class Loops /** * Runs the various loop jobs. 
*/ - public void findLoops(Path webGraphDb) - throws IOException { + public void findLoops(Path webGraphDb) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -481,7 +462,7 @@ public class Loops Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Path routes = new Path(webGraphDb, ROUTES_DIR); Path tempRoute = new Path(webGraphDb, ROUTES_DIR + "-" - + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + + Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); // run the initializer JobConf init = new NutchJob(conf); @@ -504,8 +485,7 @@ public class Loops LOG.info("Loops: installing initializer " + routes); FSUtils.replace(fs, routes, tempRoute, true); LOG.info("Loops: finished initializer"); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -536,8 +516,7 @@ public class Loops LOG.info("Loops: installing looper " + routes); FSUtils.replace(fs, routes, tempRoute, true); LOG.info("Loops: finished looper"); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } @@ -561,17 +540,16 @@ public class Loops LOG.info("Loops: starting finalizer"); JobClient.runJob(finalizer); LOG.info("Loops: finished finalizer"); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } long end = System.currentTimeMillis(); - LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("Loops: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new Loops(), args); System.exit(res); } @@ -579,15 +557,14 @@ public class Loops /** * Runs the Loops 
tool. */ - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the web graph database to use"); @@ -607,8 +584,7 @@ public class Loops String webGraphDb = line.getOptionValue("webgraphdb"); findLoops(new Path(webGraphDb)); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("Loops: " + StringUtils.stringifyException(e)); return -2; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/Node.java Thu Jan 29 05:38:59 2015 @@ -25,12 +25,11 @@ import org.apache.nutch.metadata.Metadat /** * A class which holds the number of inlinks and outlinks for a given url along - * with an inlink score from a link analysis program and any metadata. + * with an inlink score from a link analysis program and any metadata. * * The Node is the core unit of the NodeDb in the WebGraph. 
*/ -public class Node - implements Writable { +public class Node implements Writable { private int numInlinks = 0; private int numOutlinks = 0; @@ -77,8 +76,7 @@ public class Node this.metadata = metadata; } - public void readFields(DataInput in) - throws IOException { + public void readFields(DataInput in) throws IOException { numInlinks = in.readInt(); numOutlinks = in.readInt(); @@ -87,8 +85,7 @@ public class Node metadata.readFields(in); } - public void write(DataOutput out) - throws IOException { + public void write(DataOutput out) throws IOException { out.writeInt(numInlinks); out.writeInt(numOutlinks); @@ -98,8 +95,8 @@ public class Node public String toString() { return "num inlinks: " + numInlinks + ", num outlinks: " + numOutlinks - + ", inlink score: " + inlinkScore + ", outlink score: " - + getOutlinkScore() + ", metadata: " + metadata.toString(); + + ", inlink score: " + inlinkScore + ", outlink score: " + + getOutlinkScore() + ", metadata: " + metadata.toString(); } } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeDumper.java Thu Jan 29 05:38:59 2015 @@ -63,26 +63,20 @@ import org.apache.nutch.util.URLUtil; * have been run. For link analysis score a program such as LinkRank will need * to have been run which updates the NodeDb of the WebGraph. 
*/ -public class NodeDumper - extends Configured - implements Tool { +public class NodeDumper extends Configured implements Tool { public static final Logger LOG = LoggerFactory.getLogger(NodeDumper.class); private static enum DumpType { - INLINKS, - OUTLINKS, - SCORES + INLINKS, OUTLINKS, SCORES } private static enum AggrType { - SUM, - MAX + SUM, MAX } private static enum NameType { - HOST, - DOMAIN + HOST, DOMAIN } /** @@ -90,10 +84,9 @@ public class NodeDumper * on the command line, the top urls could be for number of inlinks, for * number of outlinks, or for link analysis score. */ - public static class Sorter - extends Configured - implements Mapper<Text, Node, FloatWritable, Text>, - Reducer<FloatWritable, Text, Text, FloatWritable> { + public static class Sorter extends Configured implements + Mapper<Text, Node, FloatWritable, Text>, + Reducer<FloatWritable, Text, Text, FloatWritable> { private JobConf conf; private boolean inlinks = false; @@ -121,17 +114,15 @@ public class NodeDumper * score. */ public void map(Text key, Node node, - OutputCollector<FloatWritable, Text> output, Reporter reporter) - throws IOException { + OutputCollector<FloatWritable, Text> output, Reporter reporter) + throws IOException { float number = 0; if (inlinks) { number = node.getNumInlinks(); - } - else if (outlinks) { + } else if (outlinks) { number = node.getNumOutlinks(); - } - else { + } else { number = node.getInlinkScore(); } @@ -143,8 +134,8 @@ public class NodeDumper * Flips and collects the url and numeric sort value. */ public void reduce(FloatWritable key, Iterator<Text> values, - OutputCollector<Text, FloatWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, FloatWritable> output, Reporter reporter) + throws IOException { // take the negative of the negative to get original value, sometimes 0 // value are a little weird @@ -162,14 +153,13 @@ public class NodeDumper } /** - * Outputs the hosts or domains with an associated value. 
This value consists of either - * the number of inlinks, the number of outlinks or the score. The computed value is then - * either the sum of all parts or the top value. + * Outputs the hosts or domains with an associated value. This value consists + * of either the number of inlinks, the number of outlinks or the score. The + * computed value is then either the sum of all parts or the top value. */ - public static class Dumper - extends Configured - implements Mapper<Text, Node, Text, FloatWritable>, - Reducer<Text, FloatWritable, Text, FloatWritable> { + public static class Dumper extends Configured implements + Mapper<Text, Node, Text, FloatWritable>, + Reducer<Text, FloatWritable, Text, FloatWritable> { private JobConf conf; private boolean inlinks = false; @@ -197,21 +187,19 @@ public class NodeDumper } /** - * Outputs the host or domain as key for this record and numInlinks, numOutlinks - * or score as the value. + * Outputs the host or domain as key for this record and numInlinks, + * numOutlinks or score as the value. */ public void map(Text key, Node node, - OutputCollector<Text, FloatWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, FloatWritable> output, Reporter reporter) + throws IOException { float number = 0; if (inlinks) { number = node.getNumInlinks(); - } - else if (outlinks) { + } else if (outlinks) { number = node.getNumOutlinks(); - } - else { + } else { number = node.getInlinkScore(); } @@ -228,8 +216,8 @@ public class NodeDumper * Outputs either the sum or the top value for this record. */ public void reduce(Text key, Iterator<FloatWritable> values, - OutputCollector<Text, FloatWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, FloatWritable> output, Reporter reporter) + throws IOException { long numCollected = 0; float sumOrMax = 0; @@ -256,16 +244,19 @@ public class NodeDumper /** * Runs the process to dump the top urls out to a text file. 
- * - * @param webGraphDb The WebGraph from which to pull values. - * + * + * @param webGraphDb + * The WebGraph from which to pull values. + * * @param topN * @param output - * - * @throws IOException If an error occurs while dumping the top values. + * + * @throws IOException + * If an error occurs while dumping the top values. */ - public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, boolean asEff, NameType nameType, AggrType aggrType, boolean asSequenceFile) - throws Exception { + public void dumpNodes(Path webGraphDb, DumpType type, long topN, Path output, + boolean asEff, NameType nameType, AggrType aggrType, + boolean asSequenceFile) throws Exception { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -320,77 +311,76 @@ public class NodeDumper try { LOG.info("NodeDumper: running"); JobClient.runJob(dumper); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); throw e; } long end = System.currentTimeMillis(); - LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("NodeDumper: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new NodeDumper(), - args); + args); System.exit(res); } /** * Runs the node dumper tool. 
*/ - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the web graph database to use"); Option webGraphDbOpts = OptionBuilder.create("webgraphdb"); options.addOption(webGraphDbOpts); - + OptionBuilder.withArgName("inlinks"); OptionBuilder.withDescription("show highest inlinks"); Option inlinkOpts = OptionBuilder.create("inlinks"); options.addOption(inlinkOpts); - + OptionBuilder.withArgName("outlinks"); OptionBuilder.withDescription("show highest outlinks"); Option outlinkOpts = OptionBuilder.create("outlinks"); options.addOption(outlinkOpts); - + OptionBuilder.withArgName("scores"); OptionBuilder.withDescription("show highest scores"); Option scoreOpts = OptionBuilder.create("scores"); options.addOption(scoreOpts); - + OptionBuilder.withArgName("topn"); OptionBuilder.hasOptionalArg(); OptionBuilder.withDescription("show topN scores"); Option topNOpts = OptionBuilder.create("topn"); options.addOption(topNOpts); - + OptionBuilder.withArgName("output"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the output directory to use"); Option outputOpts = OptionBuilder.create("output"); options.addOption(outputOpts); - + OptionBuilder.withArgName("asEff"); - OptionBuilder.withDescription("Solr ExternalFileField compatible output format"); + OptionBuilder + .withDescription("Solr ExternalFileField compatible output format"); Option effOpts = OptionBuilder.create("asEff"); options.addOption(effOpts); - + OptionBuilder.hasArgs(2); OptionBuilder.withDescription("group <host|domain> <sum|max>"); Option groupOpts = OptionBuilder.create("group"); options.addOption(groupOpts); - + OptionBuilder.withArgName("asSequenceFile"); 
OptionBuilder.withDescription("whether to output as a sequencefile"); Option sequenceFileOpts = OptionBuilder.create("asSequenceFile"); @@ -410,32 +400,32 @@ public class NodeDumper boolean inlinks = line.hasOption("inlinks"); boolean outlinks = line.hasOption("outlinks"); - long topN = (line.hasOption("topn") - ? Long.parseLong(line.getOptionValue("topn")) : Long.MAX_VALUE); + long topN = (line.hasOption("topn") ? Long.parseLong(line + .getOptionValue("topn")) : Long.MAX_VALUE); // get the correct dump type String output = line.getOptionValue("output"); - DumpType type = (inlinks ? DumpType.INLINKS : outlinks - ? DumpType.OUTLINKS : DumpType.SCORES); + DumpType type = (inlinks ? DumpType.INLINKS + : outlinks ? DumpType.OUTLINKS : DumpType.SCORES); NameType nameType = null; AggrType aggrType = null; String[] group = line.getOptionValues("group"); if (group != null && group.length == 2) { - nameType = (group[0].equals("host") ? NameType.HOST : group[0].equals("domain") - ? NameType.DOMAIN : null); - aggrType = (group[1].equals("sum") ? AggrType.SUM : group[1].equals("sum") - ? AggrType.MAX : null); + nameType = (group[0].equals("host") ? NameType.HOST : group[0] + .equals("domain") ? NameType.DOMAIN : null); + aggrType = (group[1].equals("sum") ? AggrType.SUM : group[1] + .equals("sum") ? AggrType.MAX : null); } // Use ExternalFileField? 
boolean asEff = line.hasOption("asEff"); boolean asSequenceFile = line.hasOption("asSequenceFile"); - dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff, nameType, aggrType, asSequenceFile); + dumpNodes(new Path(webGraphDb), type, topN, new Path(output), asEff, + nameType, aggrType, asSequenceFile); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("NodeDumper: " + StringUtils.stringifyException(e)); return -2; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/NodeReader.java Thu Jan 29 05:38:59 2015 @@ -37,7 +37,7 @@ import org.apache.nutch.util.FSUtils; import org.apache.nutch.util.NutchConfiguration; /** - * Reads and prints to system out information for a single node from the NodeDb + * Reads and prints to system out information for a single node from the NodeDb * in the WebGraph. */ public class NodeReader extends Configured { @@ -46,33 +46,35 @@ public class NodeReader extends Configur private MapFile.Reader[] nodeReaders; public NodeReader() { - + } - + public NodeReader(Configuration conf) { super(conf); } - + /** * Prints the content of the Node represented by the url to system out. * - * @param webGraphDb The webgraph from which to get the node. - * @param url The url of the node. + * @param webGraphDb + * The webgraph from which to get the node. + * @param url + * The url of the node. * - * @throws IOException If an error occurs while getting the node. + * @throws IOException + * If an error occurs while getting the node. 
*/ - public void dumpUrl(Path webGraphDb, String url) - throws IOException { + public void dumpUrl(Path webGraphDb, String url) throws IOException { fs = FileSystem.get(getConf()); nodeReaders = MapFileOutputFormat.getReaders(fs, new Path(webGraphDb, - WebGraph.NODE_DIR), getConf()); + WebGraph.NODE_DIR), getConf()); // open the readers, get the node, print out the info, and close the readers Text key = new Text(url); Node node = new Node(); MapFileOutputFormat.getEntry(nodeReaders, - new HashPartitioner<Text, Node>(), key, node); + new HashPartitioner<Text, Node>(), key, node); System.out.println(url + ":"); System.out.println(" inlink score: " + node.getInlinkScore()); System.out.println(" outlink score: " + node.getOutlinkScore()); @@ -82,25 +84,24 @@ public class NodeReader extends Configur } /** - * Runs the NodeReader tool. The command line arguments must contain a - * webgraphdb path and a url. The url must match the normalized url that is + * Runs the NodeReader tool. The command line arguments must contain a + * webgraphdb path and a url. The url must match the normalized url that is * contained in the NodeDb of the WebGraph. 
*/ - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the webgraphdb to use"); Option webGraphOpts = OptionBuilder.create("webgraphdb"); options.addOption(webGraphOpts); - + OptionBuilder.withArgName("url"); OptionBuilder.hasOptionalArg(); OptionBuilder.withDescription("the url to dump"); @@ -113,7 +114,7 @@ public class NodeReader extends Configur // command line must take a webgraphdb and a url CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") - || !line.hasOption("url")) { + || !line.hasOption("url")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("WebGraphReader", options); return; @@ -124,10 +125,9 @@ public class NodeReader extends Configur String url = line.getOptionValue("url"); NodeReader reader = new NodeReader(NutchConfiguration.create()); reader.dumpUrl(new Path(webGraphDb), url); - + return; - } - catch (Exception e) { + } catch (Exception e) { e.printStackTrace(); return; } Modified: nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java?rev=1655526&r1=1655525&r2=1655526&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java (original) +++ nutch/trunk/src/java/org/apache/nutch/scoring/webgraph/ScoreUpdater.java Thu Jan 29 05:38:59 2015 @@ -58,13 +58,12 @@ import org.apache.nutch.util.TimingUtil; /** * Updates the score from the WebGraph node database into the crawl database. 
- * Any score that is not in the node database is set to the clear score in the + * Any score that is not in the node database is set to the clear score in the * crawl database. */ -public class ScoreUpdater - extends Configured - implements Tool, Mapper<Text, Writable, Text, ObjectWritable>, - Reducer<Text, ObjectWritable, Text, CrawlDatum> { +public class ScoreUpdater extends Configured implements Tool, + Mapper<Text, Writable, Text, ObjectWritable>, + Reducer<Text, ObjectWritable, Text, CrawlDatum> { public static final Logger LOG = LoggerFactory.getLogger(ScoreUpdater.class); @@ -80,8 +79,8 @@ public class ScoreUpdater * Changes input into ObjectWritables. */ public void map(Text key, Writable value, - OutputCollector<Text, ObjectWritable> output, Reporter reporter) - throws IOException { + OutputCollector<Text, ObjectWritable> output, Reporter reporter) + throws IOException { ObjectWritable objWrite = new ObjectWritable(); objWrite.set(value); @@ -93,8 +92,8 @@ public class ScoreUpdater * with a cleared score. 
*/
   public void reduce(Text key, Iterator<ObjectWritable> values,
-    OutputCollector<Text, CrawlDatum> output, Reporter reporter)
-    throws IOException {
+      OutputCollector<Text, CrawlDatum> output, Reporter reporter)
+      throws IOException {
 
     String url = key.toString();
     Node node = null;
@@ -106,34 +105,31 @@
       ObjectWritable next = values.next();
       Object value = next.get();
       if (value instanceof Node) {
-        node = (Node)value;
-      }
-      else if (value instanceof CrawlDatum) {
-        datum = (CrawlDatum)value;
+        node = (Node) value;
+      } else if (value instanceof CrawlDatum) {
+        datum = (CrawlDatum) value;
       }
     }
 
-    // datum should never be null, could happen if somehow the url was 
+    // datum should never be null, could happen if somehow the url was
     // normalized or changed after being pulled from the crawldb
     if (datum != null) {
       if (node != null) {
-        
+
         // set the inlink score in the nodedb
         float inlinkScore = node.getInlinkScore();
         datum.setScore(inlinkScore);
         LOG.debug(url + ": setting to score " + inlinkScore);
-      }
-      else {
-        
+      } else {
+
         // clear out the score in the crawldb
         datum.setScore(clearScore);
         LOG.debug(url + ": setting to clear score of " + clearScore);
       }
 
       output.collect(key, datum);
-    }
-    else {
+    } else {
       LOG.debug(url + ": no datum");
     }
   }
@@ -142,16 +138,18 @@ }
 
   /**
-   * Updates the inlink score in the web graph node databsae into the crawl 
+   * Updates the inlink score in the web graph node database into the crawl
    * database.
    * 
-   * @param crawlDb The crawl database to update
-   * @param webGraphDb The webgraph database to use.
+   * @param crawlDb
+   *          The crawl database to update
+   * @param webGraphDb
+   *          The webgraph database to use.
    * 
-   * @throws IOException If an error occurs while updating the scores.
+   * @throws IOException
+   *           If an error occurs while updating the scores.
*/ - public void update(Path crawlDb, Path webGraphDb) - throws IOException { + public void update(Path crawlDb, Path webGraphDb) throws IOException { SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); long start = System.currentTimeMillis(); @@ -164,8 +162,8 @@ public class ScoreUpdater LOG.info("Running crawldb update " + crawlDb); Path nodeDb = new Path(webGraphDb, WebGraph.NODE_DIR); Path crawlDbCurrent = new Path(crawlDb, CrawlDb.CURRENT_NAME); - Path newCrawlDb = new Path(crawlDb, - Integer.toString(new Random().nextInt(Integer.MAX_VALUE))); + Path newCrawlDb = new Path(crawlDb, Integer.toString(new Random() + .nextInt(Integer.MAX_VALUE))); // run the updater job outputting to the temp crawl database JobConf updater = new NutchJob(conf); @@ -184,10 +182,9 @@ public class ScoreUpdater try { JobClient.runJob(updater); - } - catch (IOException e) { + } catch (IOException e) { LOG.error(StringUtils.stringifyException(e)); - + // remove the temp crawldb on error if (fs.exists(newCrawlDb)) { fs.delete(newCrawlDb, true); @@ -200,34 +197,33 @@ public class ScoreUpdater CrawlDb.install(updater, crawlDb); long end = System.currentTimeMillis(); - LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end)); + LOG.info("ScoreUpdater: finished at " + sdf.format(end) + ", elapsed: " + + TimingUtil.elapsedTime(start, end)); } - public static void main(String[] args) - throws Exception { + public static void main(String[] args) throws Exception { int res = ToolRunner.run(NutchConfiguration.create(), new ScoreUpdater(), - args); + args); System.exit(res); } /** * Runs the ScoreUpdater tool. 
*/ - public int run(String[] args) - throws Exception { + public int run(String[] args) throws Exception { Options options = new Options(); OptionBuilder.withArgName("help"); OptionBuilder.withDescription("show this help message"); Option helpOpts = OptionBuilder.create("help"); options.addOption(helpOpts); - + OptionBuilder.withArgName("crawldb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the crawldb to use"); Option crawlDbOpts = OptionBuilder.create("crawldb"); options.addOption(crawlDbOpts); - + OptionBuilder.withArgName("webgraphdb"); OptionBuilder.hasArg(); OptionBuilder.withDescription("the webgraphdb to use"); @@ -239,7 +235,7 @@ public class ScoreUpdater CommandLine line = parser.parse(options, args); if (line.hasOption("help") || !line.hasOption("webgraphdb") - || !line.hasOption("crawldb")) { + || !line.hasOption("crawldb")) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp("ScoreUpdater", options); return -1; @@ -249,8 +245,7 @@ public class ScoreUpdater String webGraphDb = line.getOptionValue("webgraphdb"); update(new Path(crawlDb), new Path(webGraphDb)); return 0; - } - catch (Exception e) { + } catch (Exception e) { LOG.error("ScoreUpdater: " + StringUtils.stringifyException(e)); return -1; }