Author: ab Date: Thu Jan 11 13:51:20 2007 New Revision: 495392 URL: http://svn.apache.org/viewvc?view=rev&rev=495392 Log: Upgrade to Hadoop 0.10.1. HTTPClient is now a dependency - move it to lib/ and remove it as a plugin.
Also add native Linux libraries for Hadoop compression, plus corresponding logic in bin/nutch. Hadoop uses larger buffers now - explicitly set large heap size for JUnit tests. All tests should pass now. Added: lucene/nutch/trunk/lib/commons-codec-1.3.jar (with props) lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar (with props) lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar (with props) lucene/nutch/trunk/lib/jets3t.jar (with props) lucene/nutch/trunk/lib/native/ lucene/nutch/trunk/lib/native/Linux-i386-32/ lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a (with props) lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so (with props) lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 (with props) lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 (with props) Removed: lucene/nutch/trunk/lib/hadoop-0.9.1.jar lucene/nutch/trunk/src/plugin/lib-commons-httpclient/ lucene/nutch/trunk/src/plugin/protocol-httpclient/lib/ Modified: lucene/nutch/trunk/CHANGES.txt lucene/nutch/trunk/bin/nutch lucene/nutch/trunk/build.xml lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java lucene/nutch/trunk/src/plugin/build.xml lucene/nutch/trunk/src/plugin/parse-rss/build.xml lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Modified: lucene/nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/CHANGES.txt (original) +++ lucene/nutch/trunk/CHANGES.txt Thu Jan 11 13:51:20 2007 @@ -126,6 +126,8 @@ 40. When indexing pages with redirection, drop all intermediate pages and index only the final page. (ab) +41. 
Upgrade to Hadoop 0.10.1. (ab) + Release 0.8 - 2006-07-25 Modified: lucene/nutch/trunk/bin/nutch URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/bin/nutch?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/bin/nutch (original) +++ lucene/nutch/trunk/bin/nutch Thu Jan 11 13:51:20 2007 @@ -123,6 +123,25 @@ CLASSPATH=${CLASSPATH}:$f; done + +# setup 'java.library.path' for native-hadoop code if necessary +JAVA_LIBRARY_PATH='' +if [ -d "${NUTCH_HOME}/build/native" -o -d "${NUTCH_HOME}/lib/native" ]; then + JAVA_PLATFORM=`CLASSPATH=${CLASSPATH} ${JAVA} org.apache.hadoop.util.PlatformName` + + if [ -d "$NUTCH_HOME/build/native" ]; then + JAVA_LIBRARY_PATH=${HADOOP_HOME}/build/native/${JAVA_PLATFORM}/lib + fi + + if [ -d "${NUTCH_HOME}/lib/native" ]; then + if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + JAVA_LIBRARY_PATH=${JAVA_LIBRARY_PATH}:${NUTCH_HOME}/lib/native/${JAVA_PLATFORM} + else + JAVA_LIBRARY_PATH=${NUTCH_HOME}/lib/native/${JAVA_PLATFORM} + fi + fi +fi + # restore ordinary behaviour unset IFS @@ -142,6 +161,10 @@ NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.dir=$NUTCH_LOG_DIR" NUTCH_OPTS="$NUTCH_OPTS -Dhadoop.log.file=$NUTCH_LOGFILE" +if [ "x$JAVA_LIBRARY_PATH" != "x" ]; then + NUTCH_OPTS="$NUTCH_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH" +fi + # figure out which class to run if [ "$COMMAND" = "crawl" ] ; then CLASS=org.apache.nutch.crawl.Crawl @@ -194,5 +217,6 @@ fi # run it +echo "$JAVA" $JAVA_HEAP_MAX $NUTCH_OPTS -classpath "$CLASSPATH" $CLASS "$@" exec "$JAVA" $JAVA_HEAP_MAX $NUTCH_OPTS -classpath "$CLASSPATH" $CLASS "$@" Modified: lucene/nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/build.xml?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/build.xml (original) +++ lucene/nutch/trunk/build.xml Thu Jan 11 13:51:20 2007 @@ -262,7 +262,7 @@ 
todir="${test.build.classes}"/> <junit printsummary="yes" haltonfailure="no" fork="yes" dir="${basedir}" - errorProperty="tests.failed" failureProperty="tests.failed"> + errorProperty="tests.failed" failureProperty="tests.failed" maxmemory="1000m"> <sysproperty key="test.build.data" value="${test.build.data}"/> <sysproperty key="test.src.dir" value="${test.src.dir}"/> <classpath refid="test.classpath"/> Added: lucene/nutch/trunk/lib/commons-codec-1.3.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-codec-1.3.jar?view=auto&rev=495392 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-codec-1.3.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar?view=auto&rev=495392 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/commons-httpclient-3.0.1.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar?view=auto&rev=495392 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/hadoop-0.10.1-core.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/jets3t.jar URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/jets3t.jar?view=auto&rev=495392 ============================================================================== Binary file - no diff available. 
Propchange: lucene/nutch/trunk/lib/jets3t.jar ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a?view=auto&rev=495392 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.a ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so?view=auto&rev=495392 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1?view=auto&rev=495392 ============================================================================== Binary file - no diff available. Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1 ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Added: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0?view=auto&rev=495392 ============================================================================== Binary file - no diff available. 
Propchange: lucene/nutch/trunk/lib/native/Linux-i386-32/libhadoop.so.1.0.0 ------------------------------------------------------------------------------ svn:mime-type = application/octet-stream Modified: lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/crawl/CrawlDbMerger.java Thu Jan 11 13:51:20 2007 @@ -120,8 +120,6 @@ job.setJobName("crawldb merge " + output); job.setInputFormat(SequenceFileInputFormat.class); - job.setInputKeyClass(Text.class); - job.setInputValueClass(CrawlDatum.class); job.setMapperClass(CrawlDbFilter.class); job.setBoolean(CrawlDbFilter.URL_FILTERING, filter); Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Thu Jan 11 13:51:20 2007 @@ -48,12 +48,12 @@ public static class InputFormat extends SequenceFileInputFormat { /** Don't split inputs, to keep things polite. 
*/ - public FileSplit[] getSplits(FileSystem fs, JobConf job, int nSplits) + public InputSplit[] getSplits(FileSystem fs, JobConf job, int nSplits) throws IOException { - Path[] files = listPaths(fs, job); - FileSplit[] splits = new FileSplit[files.length]; + Path[] files = listPaths(job); + InputSplit[] splits = new InputSplit[files.length]; for (int i = 0; i < files.length; i++) { - splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i])); + splits[i] = new FileSplit(files[i], 0, fs.getLength(files[i]), job); } return splits; } Modified: lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java (original) +++ lucene/nutch/trunk/src/java/org/apache/nutch/indexer/DeleteDuplicates.java Thu Jan 11 13:51:20 2007 @@ -145,13 +145,12 @@ private static final long INDEX_LENGTH = Integer.MAX_VALUE; /** Return each index as a split. 
*/ - public FileSplit[] getSplits(FileSystem fs, JobConf job, - int numSplits) + public InputSplit[] getSplits(JobConf job, int numSplits) throws IOException { - Path[] files = listPaths(fs, job); - FileSplit[] splits = new FileSplit[files.length]; + Path[] files = listPaths(job); + InputSplit[] splits = new InputSplit[files.length]; for (int i = 0; i < files.length; i++) { - splits[i] = new FileSplit(files[i], 0, INDEX_LENGTH); + splits[i] = new FileSplit(files[i], 0, INDEX_LENGTH, job); } return splits; } @@ -163,9 +162,9 @@ private int doc; private Text index; - public DDRecordReader(FileSystem fs, FileSplit split, JobConf job, + public DDRecordReader(FileSplit split, JobConf job, Text index) throws IOException { - indexReader = IndexReader.open(new FsDirectory(fs, split.getPath(), false, job)); + indexReader = IndexReader.open(new FsDirectory(FileSystem.get(job), split.getPath(), false, job)); maxDoc = indexReader.maxDoc(); this.index = index; } @@ -211,7 +210,7 @@ } public long getPos() throws IOException { - return maxDoc==0 ? 0 : (doc*INDEX_LENGTH)/maxDoc; + return maxDoc == 0 ? 0 : (doc*INDEX_LENGTH)/maxDoc; } public void close() throws IOException { @@ -225,16 +224,20 @@ public Writable createValue() { return new IndexDoc(); } + + public float getProgress() throws IOException { + return maxDoc == 0 ? 0.0f : (float)doc / (float)maxDoc; + } } /** Return each index as a split. 
*/ - public RecordReader getRecordReader(final FileSystem fs, - final FileSplit split, - final JobConf job, + public RecordReader getRecordReader(InputSplit split, + JobConf job, Reporter reporter) throws IOException { - final Text index = new Text(split.getPath().toString()); + FileSplit fsplit = (FileSplit)split; + Text index = new Text(fsplit.getPath().toString()); reporter.setStatus(index.toString()); - return new DDRecordReader(fs, split, job, index); + return new DDRecordReader(fsplit, job, index); } } Modified: lucene/nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/build.xml?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/plugin/build.xml (original) +++ lucene/nutch/trunk/src/plugin/build.xml Thu Jan 11 13:51:20 2007 @@ -16,7 +16,6 @@ <ant dir="index-basic" target="deploy"/> <ant dir="index-more" target="deploy"/> <ant dir="languageidentifier" target="deploy"/> - <ant dir="lib-commons-httpclient" target="deploy"/> <ant dir="lib-http" target="deploy"/> <ant dir="lib-jakarta-poi" target="deploy"/> <ant dir="lib-log4j" target="deploy"/> Modified: lucene/nutch/trunk/src/plugin/parse-rss/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/build.xml?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rss/build.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/build.xml Thu Jan 11 13:51:20 2007 @@ -23,7 +23,6 @@ <ant target="deploy" inheritall="false" dir="../lib-xml"/> <ant target="deploy" inheritall="false" dir="../lib-log4j"/> <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/> - <ant target="deploy" inheritall="false" dir="../lib-commons-httpclient"/> <ant target="deploy" inheritall="false" dir="../protocol-file"/> </target> Modified: 
lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/parse-rss/plugin.xml Thu Jan 11 13:51:20 2007 @@ -19,7 +19,6 @@ <import plugin="nutch-extensionpoints"/> <import plugin="lib-xml"/> <import plugin="lib-log4j"/> - <import plugin="lib-commons-httpclient"/> </requires> <extension id="org.apache.nutch.parse.rss" Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/build.xml Thu Jan 11 13:51:20 2007 @@ -5,14 +5,12 @@ <import file="../build-plugin.xml"/> <target name="deps-jar"> - <ant target="jar" inheritall="false" dir="../lib-commons-httpclient"/> <ant target="jar" inheritall="false" dir="../lib-http"/> </target> <path id="plugin.deps"> <fileset dir="${nutch.root}/build"> <include name="**/lib-http/*.jar" /> - <include name="**/lib-commons-httpclient/*.jar" /> </fileset> </path> Modified: lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml?view=diff&rev=495392&r1=495391&r2=495392 ============================================================================== --- lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml (original) +++ lucene/nutch/trunk/src/plugin/protocol-httpclient/plugin.xml Thu Jan 11 13:51:20 2007 @@ -10,13 +10,11 @@ <library name="protocol-httpclient.jar"> <export name="*"/> </library> - 
<library name="commons-codec.jar" /> </runtime> <requires> <import plugin="nutch-extensionpoints"/> <import plugin="lib-http"/> - <import plugin="lib-commons-httpclient"/> </requires> <extension id="org.apache.nutch.protocol.httpclient"