Author: ab
Date: Thu Jan 11 13:51:20 2007
New Revision: 495392

URL: http://svn.apache.org/viewvc?view=rev&rev=495392
Log:
Upgrade to Hadoop 0.10.1. HTTPClient is now a dependency - move it
to lib/ and remove it as a plugin.

Also add native Linux libraries for Hadoop compression, plus the corresponding
logic in bin/nutch.

Hadoop uses larger buffers now, so explicitly set a large heap size for
JUnit tests. All tests should pass now.

Added:
    lucene/nutch/trunk/lib/commons-codec-1.3.jar   (with props)
    lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar   (with props)
    lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar   (with props)
    lucene/nutch/trunk/lib/jets3t.jar   (with props)
    lucene/nutch/trunk/lib/native/
    lucene/nutch/trunk/lib/native/Linux-i386-32/
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a   (with props)
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so   (with props)
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1   (with props)
    lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0   (with props)
Removed:
    lucene/nutch/trunk/lib/hadoop-0.9.1.jar
    lucene/nutch/trunk/src/plugin/lib-commons-httpclient/
    lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/
Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/bin/nutch
    lucene/nutch/trunk/build.xml
    lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
    lucene/nutch/trunk/src/plugin/build.xml
    lucene/nutch/trunk/src/plugin/parse-rss/build.xml
    lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
    lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
    lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml

Modified: lucene/nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 13:51:20 2007
@@ -126,6 +126,8 @@
 40. When indexing pages with redirection, drop all intermediate pages and
     index only the final page. (ab)
 
+41. Upgrade to Hadoop 0.10.1. (ab)
+
 
 Release 0.8 - 2006-07-25
 

Modified: lucene/nutch/trunk/bin/nutch
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/bin/nutch (original)
+++ lucene/nutch/trunk/bin/nutch Thu Jan 11 13:51:20 2007
@@ -123,6 +123,25 @@
   CLASSPATH=${CLASSPATH}:$f;
 done
 
+
+# setup 'java.library.path' for native-hadoop code if necessary
+JAVA_LIBRARY_PATH=''
+if [ -d "${NUTCH_HOME}/build/native" -o -d "${NUTCH_HOME}/lib/native" ]; then
+  JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} 
org.apache.hadoop.util.PlatformName`
+  
+  if [ -d "$NUTCH_HOME/build/native" ]; then
+    JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib
+  fi
+  
+  if [ -d "${NUTCH_HOME}/lib/native" ]; then
+    if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+      
JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}
+    else
+      JAVA_LIBRARY_PATH=${NUTCH_HOME}/lib/native/${JAVA_PLATFORM}
+    fi
+  fi
+fi
+
 # restore ordinary behaviour
 unset IFS
 
@@ -142,6 +161,10 @@
 NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR"
 NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.file=$NUTCH_LOGFILE"
 
+if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then
+  NUTCH_OPTS="$NUTCH_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
+fi
+
 # figure out which class to run
 if [ "$COMMAND" = "crawl" ] ; then
   CLASS=org.apache.nutch.crawl.Crawl
@@ -194,5 +217,6 @@
 fi
 
 # run it
+echo "$JAVA" $JAVA_HEAP_MAX $NUTCH_OPTS -classpath "$CLASSPATH" $CLASS "$@"
 exec "$JAVA" $JAVA_HEAP_MAX $NUTCH_OPTS -classpath "$CLASSPATH" $CLASS "$@"
 

Modified: lucene/nutch/trunk/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/build.xml (original)
+++ lucene/nutch/trunk/build.xml Thu Jan 11 13:51:20 2007
@@ -262,7 +262,7 @@
           todir="${test.build.classes}"/>
 
     <junit printsummary="yes" haltonfailure="no" fork="yes" dir="${basedir}"
-      errorProperty="tests.failed" failureProperty="tests.failed">
+      errorProperty="tests.failed" failureProperty="tests.failed" 
maxmemory="1000m">
       <sysproperty key="test.build.data" value="${test.build.data}"/>
       <sysproperty key="test.src.dir" value="${test.src.dir}"/>
       <classpath refid="test.classpath"/>

Added: lucene/nutch/trunk/lib/commons-codec-1.3.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-codec-1.3.jar?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-codec-1.3.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/jets3t.jar
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jets3t.jar?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/jets3t.jar
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0?view=auto&rev=495392
==============================================================================
Binary file - no diff available.

Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Thu Jan 11 13:51:20 2007
@@ -120,8 +120,6 @@
     job.setJobName("crawldb merge " + output);
 
     job.setInputFormat(SequenceFileInputFormat.class);
-    job.setInputKeyClass(Text.class);
-    job.setInputValueClass(CrawlDatum.class);
 
     job.setMapperClass(CrawlDbFilter.class);
     job.setBoolean(CrawlDbFilter.URL_FILTERING, filter);

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jan 11 13:51:20 2007
@@ -48,12 +48,12 @@
   
   public static class InputFormat extends SequenceFileInputFormat {
     /** Don't split inputs, to keep things polite. */
-    public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits)
+    public InputSplit[] getSplits(FileSystem fs, JobConf job, int nSplits)
       throws IOException {
-      Path[] files = listPaths(fs, job);
-      FileSplit[] splits = new FileSplit[files.length];
+      Path[] files = listPaths(job);
+      InputSplit[] splits = new InputSplit[files.length];
       for (int i = 0; i < files.length; i++) {
-        splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]));
+        splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]), job);
       }
       return splits;
     }

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jan 11 13:51:20 2007
@@ -145,13 +145,12 @@
     private static final long INDEX_LENGTH = Integer.MAX_VALUE;
 
     /** Return each index as a split. */
-    public FileSplit[] getSplits(FileSystem fs, JobConf job,
-                                 int numSplits)
+    public InputSplit[] getSplits(JobConf job, int numSplits)
       throws IOException {
-      Path[] files = listPaths(fs, job);
-      FileSplit[] splits = new FileSplit[files.length];
+      Path[] files = listPaths(job);
+      InputSplit[] splits = new InputSplit[files.length];
       for (int i = 0; i < files.length; i++) {
-        splits[i] = new FileSplit(files[i], 0, INDEX_LENGTH);
+        splits[i] = new FileSplit(files[i], 0, INDEX_LENGTH, job);
       }
       return splits;
     }
@@ -163,9 +162,9 @@
       private int doc;
       private Text index;
       
-      public DDRecordReader(FileSystem fs, FileSplit split, JobConf job,
+      public DDRecordReader(FileSplit split, JobConf job,
           Text index) throws IOException {
-        indexReader = IndexReader.open(new FsDirectory(fs, split.getPath(), 
false, job));
+        indexReader = IndexReader.open(new FsDirectory(FileSystem.get(job), 
split.getPath(), false, job));
         maxDoc = indexReader.maxDoc();
         this.index = index;
       }
@@ -211,7 +210,7 @@
       }
 
       public long getPos() throws IOException {
-        return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
+        return maxDoc == 0 ? 0 : (doc*INDEX_LENGTH)/maxDoc;
       }
 
       public void close() throws IOException {
@@ -225,16 +224,20 @@
       public Writable createValue() {
         return new IndexDoc();
       }
+
+      public float getProgress() throws IOException {
+        return maxDoc == 0 ? 0.0f : (float)doc / (float)maxDoc;
+      }
     }
     
     /** Return each index as a split. */
-    public RecordReader getRecordReader(final FileSystem fs,
-                                        final FileSplit split,
-                                        final JobConf job,
+    public RecordReader getRecordReader(InputSplit split,
+                                        JobConf job,
                                         Reporter reporter) throws IOException {
-      final Text index = new Text(split.getPath().toString());
+      FileSplit fsplit = (FileSplit)split;
+      Text index = new Text(fsplit.getPath().toString());
       reporter.setStatus(index.toString());
-      return new DDRecordReader(fs, split, job, index);
+      return new DDRecordReader(fsplit, job, index);
     }
   }
   

Modified: lucene/nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/plugin/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/build.xml Thu Jan 11 13:51:20 2007
@@ -16,7 +16,6 @@
      <ant dir="index-basic" target="deploy"/>
      <ant dir="index-more" target="deploy"/>
      <ant dir="languageidentifier" target="deploy"/>
-     <ant dir="lib-commons-httpclient" target="deploy"/>
      <ant dir="lib-http" target="deploy"/>
      <ant dir="lib-jakarta-poi" target="deploy"/>
      <ant dir="lib-log4j" target="deploy"/>

Modified: lucene/nutch/trunk/src/plugin/parse-rss/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Thu Jan 11 13:51:20 2007
@@ -23,7 +23,6 @@
    <ant target="deploy" inheritall="false" dir="../lib-xml"/>
    <ant target="deploy" inheritall="false" dir="../lib-log4j"/>
    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-   <ant target="deploy" inheritall="false" dir="../lib-commons-httpclient"/>
    <ant target="deploy" inheritall="false" dir="../protocol-file"/>
  </target>
 

Modified: lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Thu Jan 11 13:51:20 2007
@@ -19,7 +19,6 @@
       <import plugin="nutch-extensionpoints"/>
       <import plugin="lib-xml"/>
       <import plugin="lib-log4j"/>
-      <import plugin="lib-commons-httpclient"/>
    </requires>
 
    <extension id="org.apache.nutch.parse.rss"

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml Thu Jan 11 13:51:20 2007
@@ -5,14 +5,12 @@
   <import file="../build-plugin.xml"/>
 
   <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-commons-httpclient"/>
     <ant target="jar" inheritall="false" dir="../lib-http"/>
   </target>
 
   <path id="plugin.deps">
     <fileset dir="${nutch.root}/build">
       <include name="**/lib-http/*.jar" />
-      <include name="**/lib-commons-httpclient/*.jar" />
     </fileset>
   </path>
 

Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml
URL: 
http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?view=diff&rev=495392&r1=495391&r2=495392
==============================================================================
--- lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original)
+++ lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Thu Jan 11 13:51:20 2007
@@ -10,13 +10,11 @@
       <library name="protocol-httpclient.jar">
          <export name="*"/>
       </library>
-      <library name="commons-codec.jar" />
    </runtime>
 
    <requires>
       <import plugin="nutch-extensionpoints"/>
       <import plugin="lib-http"/>
-      <import plugin="lib-commons-httpclient"/>
    </requires>
 
    <extension id="org.apache.nutch.protocol.httpclient"


Reply via email to