Author: ferdy Date: Tue Aug 14 07:30:29 2012 New Revision: 1372752 URL: http://svn.apache.org/viewvc?rev=1372752&view=rev Log: NUTCH-1365 Fix crawlId functionality by making use of new gora configuration
Modified: nutch/branches/2.x/CHANGES.txt nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java Modified: nutch/branches/2.x/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1372752&r1=1372751&r2=1372752&view=diff ============================================================================== --- nutch/branches/2.x/CHANGES.txt (original) +++ nutch/branches/2.x/CHANGES.txt Tue Aug 14 07:30:29 2012 @@ -2,6 +2,8 @@ Nutch Change Log Release 2.1 - Current Development +* NUTCH-1365 Fix crawlId functionalilty by making using of new gora configuration (ferdy) + * NUTCH-1442 indexingfilter.order is property is misread in code (ferdy via lewismc) * NUTCH-1450 Upgrade to gora deps to 0.2.1 (lewismc) Modified: nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java?rev=1372752&r1=1372751&r2=1372752&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/host/HostInjectorJob.java Tue Aug 14 07:30:29 2012 @@ -150,7 +150,7 @@ public class HostInjectorJob implements job.setMapOutputValueClass(Host.class); job.setOutputFormatClass(GoraOutputFormat.class); GoraOutputFormat.setOutput(job, - StorageUtils.createWebStore(getConf(), String.class, Host.class), true); + StorageUtils.createWebStore(job.getConfiguration(), String.class, Host.class), true); job.setReducerClass(Reducer.class); job.setNumReduceTasks(0); return job.waitForCompletion(true); Modified: nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java?rev=1372752&r1=1372751&r2=1372752&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/storage/StorageUtils.java Tue Aug 14 07:30:29 2012 @@ -33,6 +33,10 @@ import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Partitioner; import org.apache.nutch.metadata.Nutch; +/** + * Entry point to Gora store/mapreduce functionality. + * Translates the concept of "crawlid" to the corresponding Gora support. + */ public class StorageUtils { /** Creates a store for the given persistentClass. @@ -61,7 +65,9 @@ public class StorageUtils { String crawlId = conf.get(Nutch.CRAWL_ID_KEY, ""); if (!crawlId.isEmpty()) { - schema = crawlId + "_" + schema; + conf.set("schema.prefix", crawlId + "_"); + } else { + conf.set("schema.prefix", ""); } Class<? extends DataStore<K, V>> dataStoreClass = @@ -71,7 +77,7 @@ public class StorageUtils { } @SuppressWarnings("unchecked") - public static <K, V extends Persistent> Class<? extends DataStore<K, V>> + private static <K, V extends Persistent> Class<? extends DataStore<K, V>> getDataStoreClass(Configuration conf) throws ClassNotFoundException { return (Class<? extends DataStore<K, V>>) Class.forName(conf.get("storage.data.store.class", @@ -81,15 +87,6 @@ public class StorageUtils { public static <K, V> void initMapperJob(Job job, Collection<WebPage.Field> fields, Class<K> outKeyClass, Class<V> outValueClass, - Class<? 
extends GoraMapper<String, WebPage, K, V>> mapperClass, boolean reuseObjects) - throws ClassNotFoundException, IOException { - initMapperJob(job, fields, outKeyClass, outValueClass, - mapperClass, null, reuseObjects); - } - - public static <K, V> void initMapperJob(Job job, - Collection<WebPage.Field> fields, - Class<K> outKeyClass, Class<V> outValueClass, Class<? extends GoraMapper<String, WebPage, K, V>> mapperClass) throws ClassNotFoundException, IOException { initMapperJob(job, fields, outKeyClass, outValueClass, Modified: nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java?rev=1372752&r1=1372751&r2=1372752&view=diff ============================================================================== --- nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java (original) +++ nutch/branches/2.x/src/java/org/apache/nutch/util/NutchJob.java Tue Aug 14 07:30:29 2012 @@ -20,6 +20,7 @@ package org.apache.nutch.util; import java.io.IOException; import org.apache.avro.util.Utf8; +import org.apache.commons.lang.StringUtils; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.Job; import org.apache.nutch.metadata.Nutch; @@ -34,6 +35,12 @@ public class NutchJob extends Job { public NutchJob(Configuration conf, String jobName) throws IOException { super(conf, jobName); + //prefix jobName with crawlId if not empty + String crawlId = conf.get("storage.crawl.id"); + if (!StringUtils.isEmpty(crawlId)) { + jobName = "["+crawlId+"]"+jobName; + setJobName(jobName); + } setJarByClass(this.getClass()); }