Author: lewismc
Date: Sun Mar 18 16:46:33 2012
New Revision: 1302161

URL: http://svn.apache.org/viewvc?rev=1302161&view=rev
Log:
commit to address NUTCH-1273

Modified:
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
    nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
    nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
    nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
    nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
    nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java
    nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
    nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
    nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
    nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
    
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDatum.java Sun Mar 18 
16:46:33 2012
@@ -262,7 +262,7 @@ public class CrawlDatum implements Writa
     if (version > 3) {
       boolean hasMetadata = false;
       if (version < 7) {
-        MapWritable oldMetaData = new MapWritable();
+        org.apache.hadoop.io.MapWritable oldMetaData = new org.apache.hadoop.io.MapWritable();
         if (in.readBoolean()) {
           hasMetadata = true;
           metaData = new org.apache.hadoop.io.MapWritable();

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbReader.java Sun Mar 18 
16:46:33 2012
@@ -19,6 +19,7 @@ package org.apache.nutch.crawl;
 
 import java.io.DataOutputStream;
 import java.io.IOException;
+import java.io.Closeable;
 import java.net.URL;
 import java.util.Date;
 import java.util.Iterator;
@@ -35,7 +36,6 @@ import org.slf4j.LoggerFactory;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
-import org.apache.hadoop.io.Closeable;
 import org.apache.hadoop.io.FloatWritable;
 import org.apache.hadoop.io.LongWritable;
 import org.apache.hadoop.io.MapFile;

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/LinkDbReader.java Sun Mar 18 
16:46:33 2012
@@ -37,6 +37,7 @@ import org.apache.nutch.util.TimingUtil;
 
 import java.text.SimpleDateFormat;
 import java.util.Iterator;
+import java.io.Closeable;
 
 /** . */
 public class LinkDbReader extends Configured implements Tool, Closeable {

Modified: nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/crawl/NutchWritable.java Sun Mar 18 
16:46:33 2012
@@ -31,12 +31,12 @@ public class NutchWritable extends Gener
       org.apache.hadoop.io.BytesWritable.class,
       org.apache.hadoop.io.FloatWritable.class,
       org.apache.hadoop.io.IntWritable.class,
+      org.apache.hadoop.io.MapWritable.class,
       org.apache.hadoop.io.Text.class,
       org.apache.hadoop.io.MD5Hash.class,
       org.apache.nutch.crawl.CrawlDatum.class,
       org.apache.nutch.crawl.Inlink.class,
       org.apache.nutch.crawl.Inlinks.class,
-      org.apache.nutch.crawl.MapWritable.class,
       org.apache.nutch.fetcher.FetcherOutput.class,
       org.apache.nutch.metadata.Metadata.class,
       org.apache.nutch.parse.Outlink.class,

Modified: nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/parse/ParseSegment.java Sun Mar 18 
16:46:33 2012
@@ -75,7 +75,7 @@ public class ParseSegment extends Config
                   OutputCollector<Text, ParseImpl> output, Reporter reporter)
     throws IOException {
     // convert on the fly from old UTF8 keys
-    if (key instanceof UTF8) {
+    if (key instanceof Text) {
       newKey.set(key.toString());
       key = newKey;
     }

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginDescriptor.java Sun Mar 
18 16:46:33 2012
@@ -20,6 +20,7 @@ import java.io.File;
 import java.io.IOException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.net.URI;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.Locale;
@@ -214,12 +215,17 @@ public class PluginDescriptor {
 
   /**
    * Adds a exported library with a relative path to the plugin directory.
+   * We automatically escape characters that are illegal in URLs. It is 
+   * recommended that code converts an abstract pathname into a URL by 
+   * first converting it into a URI, via the toURI method, and then 
+   * converting the URI into a URL via the URI.toURL method.
    * 
    * @param pLibPath
    */
   public void addExportedLibRelative(String pLibPath)
       throws MalformedURLException {
-    URL url = new File(getPluginPath() + File.separator + pLibPath).toURL();
+    URI uri = new File(getPluginPath() + File.separator + pLibPath).toURI();
+    URL url = uri.toURL();
     fExportedLibs.add(url);
   }
 
@@ -242,13 +248,18 @@ public class PluginDescriptor {
   }
 
   /**
-   * Adds a not exported library with a plugin directory relative path.
+   * Adds a not exported library with a relative path to the plugin directory.
+   * We automatically escape characters that are illegal in URLs. It is 
+   * recommended that code converts an abstract pathname into a URL by 
+   * first converting it into a URI, via the toURI method, and then 
+   * converting the URI into a URL via the URI.toURL method.
    * 
    * @param pLibPath
    */
   public void addNotExportedLibRelative(String pLibPath)
       throws MalformedURLException {
-    URL url = new File(getPluginPath() + File.separator + pLibPath).toURL();
+    URI uri = new File(getPluginPath() + File.separator + pLibPath).toURI();
+    URL url = uri.toURL();
     fNotExportedLibs.add(url);
   }
 
@@ -279,7 +290,7 @@ public class PluginDescriptor {
     try {
       for (File file2 : file.listFiles()) {
         if (file2.getAbsolutePath().endsWith("properties"))
-          arrayList.add(file2.getParentFile().toURL());
+          arrayList.add(file2.getParentFile().toURI().toURL());
       }
     } catch (MalformedURLException e) {
       LOG.debug(getPluginId() + " " + e.toString());

Modified: nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/plugin/PluginManifestParser.java Sun 
Mar 18 16:46:33 2012
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.io.UnsupportedEncodingException;
 import java.net.MalformedURLException;
 import java.net.URL;
+import java.net.URI;
 import java.net.URLDecoder;
 import java.util.HashMap;
 import java.util.Map;
@@ -147,7 +148,7 @@ public class PluginManifestParser {
   private PluginDescriptor parseManifestFile(String pManifestPath)
       throws MalformedURLException, SAXException, IOException,
       ParserConfigurationException {
-    Document document = parseXML(new File(pManifestPath).toURL());
+    Document document = parseXML(new File(pManifestPath).toURI().toURL());
     String pPath = new File(pManifestPath).getParent();
     return parsePlugin(document, pPath);
   }

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/Content.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/Content.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/Content.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/Content.java Sun Mar 18 
16:46:33 2012
@@ -33,7 +33,6 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.ArrayFile;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.VersionMismatchException;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.util.GenericOptionsParser;
@@ -93,21 +92,21 @@ public final class Content implements Wr
     switch (oldVersion) {
     case 0:
     case 1:
-      url = UTF8.readString(in); // read url
-      base = UTF8.readString(in); // read base
+      url = Text.readString(in); // read url
+      base = Text.readString(in); // read base
 
       content = new byte[in.readInt()]; // read content
       in.readFully(content);
 
-      contentType = UTF8.readString(in); // read contentType
+      contentType = Text.readString(in); // read contentType
       // reconstruct metadata
       int keySize = in.readInt();
       String key;
       for (int i = 0; i < keySize; i++) {
-        key = UTF8.readString(in);
+        key = Text.readString(in);
         int valueSize = in.readInt();
         for (int j = 0; j < valueSize; j++) {
-          metadata.add(key, UTF8.readString(in));
+          metadata.add(key, Text.readString(in));
         }
       }
       break;

Modified: nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/segment/SegmentReader.java Sun Mar 18 
16:46:33 2012
@@ -43,7 +43,6 @@ import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.io.MapFile;
 import org.apache.hadoop.io.SequenceFile;
 import org.apache.hadoop.io.Text;
-import org.apache.hadoop.io.UTF8;
 import org.apache.hadoop.io.Writable;
 import org.apache.hadoop.io.WritableComparable;
 import org.apache.hadoop.mapred.FileInputFormat;
@@ -86,8 +85,9 @@ public class SegmentReader extends Confi
 
     public void map(WritableComparable key, Writable value,
         OutputCollector<Text, NutchWritable> collector, Reporter reporter) throws IOException {
-      // convert on the fly from old formats with UTF8 keys
-      if (key instanceof UTF8) {
+      // convert on the fly from old formats with UTF8 keys.
+      // UTF8 deprecated and replaced by Text.
+      if (key instanceof Text) {
         newKey.set(key.toString());
         key = newKey;
       }
@@ -252,7 +252,7 @@ public class SegmentReader extends Confi
         writer.close();
       }
     }
-    fs.delete(tempDir);
+    fs.delete(tempDir, true);
     if (LOG.isInfoEnabled()) { LOG.info("SegmentReader: done"); }
   }
 

Modified: nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java 
(original)
+++ nutch/trunk/src/java/org/apache/nutch/tools/arc/ArcSegmentCreator.java Sun 
Mar 18 16:46:33 2012
@@ -315,7 +315,7 @@ public class ArcSegmentCreator
         // the arc file,  TODO: currently this doesn't handle text of errors
         // pages (i.e. 404, etc.). We assume we won't get those.
         ProtocolStatus status = ProtocolStatus.STATUS_SUCCESS;
-        Content content = new Content(urlStr, urlStr, bytes.get(), contentType,
+        Content content = new Content(urlStr, urlStr, bytes.getBytes(), contentType,
           new Metadata(), getConf());
         
         // set the url version into the metadata

Modified: 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java?rev=1302161&r1=1302160&r2=1302161&view=diff
==============================================================================
--- 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 (original)
+++ 
nutch/trunk/src/plugin/protocol-file/src/java/org/apache/nutch/protocol/file/FileResponse.java
 Sun Mar 18 16:46:33 2012
@@ -19,6 +19,7 @@ package org.apache.nutch.protocol.file;
 
 // JDK imports
 import java.net.URL;
+import java.net.URI;
 import java.util.Date;
 import java.util.TreeMap;
 import java.io.IOException;
@@ -151,7 +152,12 @@ public class FileResponse {
       if (!f.equals(f.getCanonicalFile())) {
         // set headers
         //hdrs.put("Location", f.getCanonicalFile().toURI());
-        headers.set(Response.LOCATION, f.getCanonicalFile().toURL().toString());
+        //
+        // we want to automatically escape characters that are illegal in URLs.
+        // It is recommended that new code convert an abstract pathname into a URL
+        // by first converting it into a URI, via the toURI method, and then 
+        // converting the URI into a URL via the URI.toURL method.
+        headers.set(Response.LOCATION, f.getCanonicalFile().toURI().toURL().toString());
 
         this.code = 300;  // http redirect
         return;


Reply via email to