Author: jnioche Date: Thu Aug 29 11:27:45 2013 New Revision: 1518594 URL: http://svn.apache.org/r1518594 Log: (NUTCH-1622) Create Outlinks with metadata
Modified: nutch/trunk/CHANGES.txt nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1518594&r1=1518593&r2=1518594&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Thu Aug 29 11:27:45 2013 @@ -2,6 +2,8 @@ Nutch Change Log Nutch Development Trunk +* NUTCH-1622 Create Outlinks with metadata (jnioche) + * NUTCH-1629 Injector skips empty lines in seed files (kaveh minooie via jnioche) * NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via snagel) Modified: nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=1518594&r1=1518593&r2=1518594&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Aug 29 11:27:45 2013 @@ -17,65 +17,116 @@ package org.apache.nutch.parse; -import java.io.*; +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; import java.net.MalformedURLException; +import java.util.Map.Entry; -import org.apache.hadoop.io.*; +import org.apache.hadoop.io.MapWritable; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; /* An outgoing link from a page. */ public class Outlink implements Writable { - private String toUrl; - private String anchor; - - public Outlink() {} - - public Outlink(String toUrl, String anchor) throws MalformedURLException { - this.toUrl = toUrl; - if (anchor == null) anchor = ""; - this.anchor = anchor; - } - - public void readFields(DataInput in) throws IOException { - toUrl = Text.readString(in); - anchor = Text.readString(in); - } - - /** Skips over one Outlink in the input. */ - public static void skip(DataInput in) throws IOException { - Text.skip(in); // skip toUrl - Text.skip(in); // skip anchor - } - - public void write(DataOutput out) throws IOException { - Text.writeString(out, toUrl); - Text.writeString(out, anchor); - } - - public static Outlink read(DataInput in) throws IOException { - Outlink outlink = new Outlink(); - outlink.readFields(in); - return outlink; - } - - public String getToUrl() { return toUrl; } - public String getAnchor() { return anchor; } - - public void setUrl(String toUrl) { - this.toUrl = toUrl; - } - - public boolean equals(Object o) { - if (!(o instanceof Outlink)) - return false; - Outlink other = (Outlink)o; - return - this.toUrl.equals(other.toUrl) && - this.anchor.equals(other.anchor); - } - - public String toString() { - return "toUrl: " + toUrl + " anchor: " + anchor; // removed "\n". toString, not printLine... WD. - } + private String toUrl; + private String anchor; + private MapWritable md; + + public Outlink() { + } + + public Outlink(String toUrl, String anchor) throws MalformedURLException { + this.toUrl = toUrl; + if (anchor == null) + anchor = ""; + this.anchor = anchor; + md = null; + } + + public void readFields(DataInput in) throws IOException { + toUrl = Text.readString(in); + anchor = Text.readString(in); + boolean hasMD = in.readBoolean(); + if (hasMD) { + md = new org.apache.hadoop.io.MapWritable(); + md.readFields(in); + } else + md = null; + } + + /** Skips over one Outlink in the input. */ + public static void skip(DataInput in) throws IOException { + Text.skip(in); // skip toUrl + Text.skip(in); // skip anchor + boolean hasMD = in.readBoolean(); + if (hasMD) { + MapWritable metadata = new org.apache.hadoop.io.MapWritable(); + metadata.readFields(in); + ; + } + } + + public void write(DataOutput out) throws IOException { + Text.writeString(out, toUrl); + Text.writeString(out, anchor); + if (md != null && md.size() > 0) { + out.writeBoolean(true); + md.write(out); + } else { + out.writeBoolean(false); + } + } + + public static Outlink read(DataInput in) throws IOException { + Outlink outlink = new Outlink(); + outlink.readFields(in); + return outlink; + } + + public String getToUrl() { + return toUrl; + } + + public void setUrl(String toUrl) { + this.toUrl = toUrl; + } + + public String getAnchor() { + return anchor; + } + + public MapWritable getMetadata() { + return md; + } + + public void setMetadata(MapWritable md) { + this.md = md; + } + + public boolean equals(Object o) { + if (!(o instanceof Outlink)) + return false; + Outlink other = (Outlink) o; + return this.toUrl.equals(other.toUrl) + && this.anchor.equals(other.anchor); + } + + public String toString() { + StringBuffer repr = new StringBuffer("toUrl: "); + repr.append(toUrl); + repr.append(" anchor: "); + repr.append(anchor); + if (md != null && !md.isEmpty()) { + for (Entry<Writable, Writable> e : md.entrySet()) { + repr.append(" "); + repr.append(e.getKey()); + repr.append(": "); + repr.append(e.getValue()); + } + } + return repr.toString(); + } } Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1518594&r1=1518593&r2=1518594&view=diff ============================================================================== --- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original) +++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Aug 29 11:27:45 2013 @@ -233,6 +233,15 @@ public class ParseOutputFormat implement CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, interval); Text targetUrl = new Text(toUrl); + + // see if the outlink has any metadata attached + // and if so pass that to the crawldatum so that + // the initial score or distribution can use that + MapWritable outlinkMD = links[i].getMetadata(); + if (outlinkMD!=null){ + target.getMetaData().putAll(outlinkMD); + } + try { scfilters.initialScore(targetUrl, target); } catch (ScoringFilterException e) {