Author: jnioche
Date: Thu Aug 29 11:27:45 2013
New Revision: 1518594

URL: http://svn.apache.org/r1518594
Log:
(NUTCH-1622) Create Outlinks with metadata

Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1518594&r1=1518593&r2=1518594&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Thu Aug 29 11:27:45 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Development Trunk
 
+* NUTCH-1622 Create Outlinks with metadata (jnioche)
+
 * NUTCH-1629 Injector skips empty lines in seed files (kaveh minooie via jnioche)
 
 * NUTCH-911 protocol-file to return proper protocol status (Peter Lundberg via snagel)

Modified: nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java?rev=1518594&r1=1518593&r2=1518594&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/Outlink.java Thu Aug 29 11:27:45 2013
@@ -17,65 +17,116 @@
 
 package org.apache.nutch.parse;
 
-import java.io.*;
+import java.io.DataInput;
+import java.io.DataOutput;
+import java.io.IOException;
 import java.net.MalformedURLException;
+import java.util.Map.Entry;
 
-import org.apache.hadoop.io.*;
+import org.apache.hadoop.io.MapWritable;
+import org.apache.hadoop.io.Text;
+import org.apache.hadoop.io.Writable;
 
 /* An outgoing link from a page. */
 public class Outlink implements Writable {
 
-  private String toUrl;
-  private String anchor;
-
-  public Outlink() {}
-
-  public Outlink(String toUrl, String anchor) throws MalformedURLException {
-    this.toUrl = toUrl;
-    if (anchor == null) anchor = "";
-    this.anchor = anchor;
-  }
-
-  public void readFields(DataInput in) throws IOException {
-    toUrl = Text.readString(in);
-    anchor = Text.readString(in);
-  }
-
-  /** Skips over one Outlink in the input. */
-  public static void skip(DataInput in) throws IOException {
-    Text.skip(in);                                // skip toUrl
-    Text.skip(in);                                // skip anchor
-  }
-
-  public void write(DataOutput out) throws IOException {
-    Text.writeString(out, toUrl);
-    Text.writeString(out, anchor);
-  }
-
-  public static Outlink read(DataInput in) throws IOException {
-    Outlink outlink = new Outlink();
-    outlink.readFields(in);
-    return outlink;
-  }
-
-  public String getToUrl() { return toUrl; }
-  public String getAnchor() { return anchor; }
-
-  public void setUrl(String toUrl) {
-    this.toUrl = toUrl;
-  }
-
-  public boolean equals(Object o) {
-    if (!(o instanceof Outlink))
-      return false;
-    Outlink other = (Outlink)o;
-    return
-      this.toUrl.equals(other.toUrl) &&
-      this.anchor.equals(other.anchor);
-  }
-
-  public String toString() {
-    return "toUrl: " + toUrl + " anchor: " + anchor;  // removed "\n". 
toString, not printLine... WD.
-  }
+    private String toUrl;
+    private String anchor;
+    private MapWritable md;
+
+    /**
+     * Creates an outlink with no URL, anchor or metadata set. The fields are
+     * filled in afterwards, e.g. by {@link #readFields(DataInput)}.
+     */
+    public Outlink() {
+        // intentionally empty
+    }
+
+    /**
+     * Creates an outlink pointing at {@code toUrl} with the given anchor text
+     * and no metadata.
+     *
+     * @param toUrl  target URL of the link; stored as given
+     * @param anchor anchor text of the link; {@code null} is replaced by the
+     *               empty string
+     * @throws MalformedURLException declared for callers; this constructor
+     *                               body itself does not validate the URL
+     */
+    public Outlink(String toUrl, String anchor) throws MalformedURLException {
+        this.toUrl = toUrl;
+        this.anchor = (anchor == null) ? "" : anchor;
+        this.md = null;
+    }
+
+    /**
+     * Restores this outlink from {@code in}. The layout mirrors
+     * {@link #write(DataOutput)}: target URL, anchor, a boolean flag, and, only
+     * when the flag is {@code true}, a serialized {@link MapWritable}.
+     *
+     * @param in stream to read from
+     * @throws IOException if reading from {@code in} fails
+     */
+    public void readFields(DataInput in) throws IOException {
+        toUrl = Text.readString(in);
+        anchor = Text.readString(in);
+
+        // The flag tells whether a metadata map follows in the stream.
+        if (!in.readBoolean()) {
+            md = null;
+            return;
+        }
+        md = new MapWritable();
+        md.readFields(in);
+    }
+
+    /**
+     * Skips over one Outlink in the input.
+     *
+     * @param in stream positioned at the start of a serialized outlink
+     * @throws IOException if reading from {@code in} fails
+     */
+    public static void skip(DataInput in) throws IOException {
+        Text.skip(in); // target URL
+        Text.skip(in); // anchor text
+        if (in.readBoolean()) {
+            // A metadata map follows; it is consumed by reading it into a
+            // throw-away instance so the stream ends up past this outlink.
+            MapWritable discarded = new MapWritable();
+            discarded.readFields(in);
+        }
+    }
+
+    /**
+     * Serializes this outlink to {@code out}: target URL, anchor, then a
+     * boolean flag followed by the metadata map. The map is written only when
+     * it is non-null and non-empty; otherwise the flag is {@code false} and
+     * nothing follows it.
+     *
+     * @param out stream to write to
+     * @throws IOException if writing to {@code out} fails
+     */
+    public void write(DataOutput out) throws IOException {
+        Text.writeString(out, toUrl);
+        Text.writeString(out, anchor);
+
+        final boolean hasMetadata = md != null && md.size() > 0;
+        out.writeBoolean(hasMetadata);
+        if (hasMetadata) {
+            md.write(out);
+        }
+    }
+
+    /**
+     * Reads one outlink from {@code in} into a freshly created instance.
+     *
+     * @param in stream positioned at the start of a serialized outlink
+     * @return the outlink that was read
+     * @throws IOException if reading from {@code in} fails
+     */
+    public static Outlink read(DataInput in) throws IOException {
+        final Outlink result = new Outlink();
+        result.readFields(in);
+        return result;
+    }
+
+    /** @return the target URL of this outlink */
+    public String getToUrl() {
+        return this.toUrl;
+    }
+
+    /**
+     * Replaces the target URL of this outlink.
+     *
+     * @param toUrl the new target URL
+     */
+    public void setUrl(String toUrl) {
+        this.toUrl = toUrl;
+    }
+
+    /** @return the anchor text of this outlink */
+    public String getAnchor() {
+        return this.anchor;
+    }
+
+    /**
+     * @return the metadata attached to this outlink, or {@code null} when none
+     *         has been set or read
+     */
+    public MapWritable getMetadata() {
+        return this.md;
+    }
+
+    /**
+     * Attaches metadata to this outlink. The map is stored by reference, not
+     * copied.
+     *
+     * @param md the metadata map; may be {@code null} to clear it
+     */
+    public void setMetadata(MapWritable md) {
+        this.md = md;
+    }
+
+    /**
+     * Two outlinks are equal when their target URL and anchor text are equal.
+     * Metadata is not part of the comparison. Unset ({@code null}) fields are
+     * compared without throwing, so instances created with the no-argument
+     * constructor can be compared safely.
+     *
+     * @param o the object to compare with
+     * @return {@code true} if {@code o} is an Outlink with the same URL and
+     *         anchor
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (this == o)
+            return true;
+        if (!(o instanceof Outlink))
+            return false;
+        Outlink other = (Outlink) o;
+        boolean sameUrl = (toUrl == null) ? other.toUrl == null
+                : toUrl.equals(other.toUrl);
+        boolean sameAnchor = (anchor == null) ? other.anchor == null
+                : anchor.equals(other.anchor);
+        return sameUrl && sameAnchor;
+    }
+
+    /**
+     * Hash code consistent with {@link #equals(Object)}: derived from the
+     * target URL and anchor only, so equal outlinks land in the same bucket of
+     * hash-based collections. Note that {@link #setUrl(String)} changes the
+     * hash, so an outlink should not be mutated while it is used as a key.
+     *
+     * @return hash of the URL and anchor; unset fields contribute 0
+     */
+    @Override
+    public int hashCode() {
+        int result = (toUrl == null) ? 0 : toUrl.hashCode();
+        result = 31 * result + ((anchor == null) ? 0 : anchor.hashCode());
+        return result;
+    }
+
+    /**
+     * Builds a single-line description of the form
+     * {@code toUrl: <url> anchor: <anchor>}, followed by one
+     * {@code " <key>: <value>"} pair per metadata entry when metadata is
+     * present.
+     *
+     * @return the textual representation of this outlink
+     */
+    public String toString() {
+        StringBuilder text = new StringBuilder("toUrl: ");
+        text.append(toUrl).append(" anchor: ").append(anchor);
+        if (md == null) {
+            return text.toString();
+        }
+        // Iterating an empty map appends nothing, so no emptiness check is needed.
+        for (Entry<Writable, Writable> pair : md.entrySet()) {
+            text.append(" ").append(pair.getKey())
+                .append(": ").append(pair.getValue());
+        }
+        return text.toString();
+    }
 
 }

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java?rev=1518594&r1=1518593&r2=1518594&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseOutputFormat.java Thu Aug 29 11:27:45 2013
@@ -233,6 +233,15 @@ public class ParseOutputFormat implement
 
             CrawlDatum target = new CrawlDatum(CrawlDatum.STATUS_LINKED, 
interval);
             Text targetUrl = new Text(toUrl);
+            
+            // see if the outlink has any metadata attached 
+            // and if so pass that to the crawldatum so that 
+            // the initial score or distribution can use that 
+            MapWritable outlinkMD = links[i].getMetadata();
+            if (outlinkMD!=null){
+               target.getMetaData().putAll(outlinkMD);
+            }
+            
             try {
               scfilters.initialScore(targetUrl, target);
             } catch (ScoringFilterException e) {


Reply via email to