Author: lewismc
Date: Fri Aug 10 14:11:30 2012
New Revision: 1371708

URL: http://svn.apache.org/viewvc?rev=1371708&view=rev
Log:
NUTCH-1160 Write JUnit test for index-basic

Added:
    nutch/branches/2.x/src/plugin/index-basic/src/test/
    nutch/branches/2.x/src/plugin/index-basic/src/test/org/
    nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/
    nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/
    nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/
    
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/
    
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
Modified:
    nutch/branches/2.x/conf/schema-solr4.xml
    nutch/branches/2.x/src/plugin/build.xml
    
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
    
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java

Modified: nutch/branches/2.x/conf/schema-solr4.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/conf/schema-solr4.xml?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
--- nutch/branches/2.x/conf/schema-solr4.xml (original)
+++ nutch/branches/2.x/conf/schema-solr4.xml Fri Aug 10 14:11:30 2012
@@ -311,6 +311,8 @@
     <!-- fields for index-basic plugin -->
     <field name="host" type="url" stored="false" indexed="true"/>
     <field name="url" type="url" stored="true" indexed="true" required="true"/>
+    <field name="orig" type="url" stored="true" indexed="true" />
+    <field name="site" type="string" stored="false" indexed="true"/>
     <!-- stored=true for highlighting, use term vectors  and positions for fast highlighting -->
     <field name="content" type="text_general" stored="true" indexed="true"/>
     <field name="title" type="text_general" stored="true" indexed="true"/>

Modified: nutch/branches/2.x/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/build.xml?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/build.xml (original)
+++ nutch/branches/2.x/src/plugin/build.xml Fri Aug 10 14:11:30 2012
@@ -74,7 +74,8 @@
      <ant dir="parse-tika" target="test"/>
      <ant dir="protocol-file" target="test"/>
      <ant dir="parse-html" target="test"/>
-        <ant dir="index-anchor" target="test"/>
+     <ant dir="index-anchor" target="test"/>
+     <ant dir="index-basic" target="test"/>
      <ant dir="index-more" target="test"/>
      <ant dir="language-identifier" target="test"/>
      <ant dir="protocol-httpclient" target="test"/>

Modified: 
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/index-anchor/src/test/org/apache/nutch/indexer/anchor/TestAnchorIndexingFilter.java
 Fri Aug 10 14:11:30 2012
@@ -16,15 +16,10 @@
  */
 package org.apache.nutch.indexer.anchor;
 
-import java.io.FileNotFoundException;
-import java.io.FileReader;
-import java.util.Collection;
-
 import junit.framework.TestCase;
 
 import org.apache.avro.util.Utf8;
 import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.indexer.IndexingException;
 import org.apache.nutch.indexer.NutchDocument;
 import org.apache.nutch.storage.WebPage;
 import org.apache.nutch.util.NutchConfiguration;

Modified: 
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java?rev=1371708&r1=1371707&r2=1371708&view=diff
==============================================================================
--- 
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 (original)
+++ 
nutch/branches/2.x/src/plugin/index-basic/src/java/org/apache/nutch/indexer/basic/BasicIndexingFilter.java
 Fri Aug 10 14:11:30 2012
@@ -36,7 +36,17 @@ import org.apache.nutch.util.Bytes;
 import org.apache.nutch.util.TableUtil;
 import org.apache.solr.common.util.DateUtil;
 
-/** Adds basic searchable fields to a document. */
+/** Adds basic searchable fields to a document. The fields are:
+ * host - add host as un-stored, indexed and tokenized
+ * site - add site as un-stored, indexed and un-tokenized
+ * url - url is both stored and indexed, so it's both searchable and returned. 
+ * This is also a required field.
+ * orig - also store original url as both stored and indexed
+ * content - content is indexed, so that it's searchable, but not stored in index
+ * title - title is stored and indexed
+ * cache - add cached content/summary display policy, if available
+ * tstamp - add timestamp when fetched, for deduplication
+ */
 public class BasicIndexingFilter implements IndexingFilter {
   public static final Logger LOG = LoggerFactory.getLogger(BasicIndexingFilter.class);
 
@@ -51,6 +61,16 @@ public class BasicIndexingFilter impleme
     FIELDS.add(WebPage.Field.FETCH_TIME);
   }
 
+  /**
+   * The {@link BasicIndexingFilter} filter object which supports a
+   * configurable integer value for the maximum number of characters permitted
+   * within the title; see {@code indexer.max.title.length} in nutch-default.xml.
+   *
+   * @param doc The {@link NutchDocument} object
+   * @param url URL of the page being indexed
+   * @param page {@link WebPage} object relative to the URL
+   * @return filtered NutchDocument
+   */
   public NutchDocument filter(NutchDocument doc, String url, WebPage page)
       throws IndexingException {
 
@@ -83,7 +103,7 @@ public class BasicIndexingFilter impleme
     doc.add("url", reprUrl == null ? url : reprUrl);
 
     if (reprUrl != null) {
-      // also store original url as both stored and indexes
+      // also store original url as both stored and indexed
       doc.add("orig", url);
     }
 
@@ -118,15 +138,28 @@ public class BasicIndexingFilter impleme
   public void addIndexBackendOptions(Configuration conf) {
   }
 
+  /**
+   * Set the {@link Configuration} object
+   */
   public void setConf(Configuration conf) {
     this.conf = conf;
     this.MAX_TITLE_LENGTH = conf.getInt("indexer.max.title.length", 100);
+    LOG.info("Maximum title length for indexing set to: " + this.MAX_TITLE_LENGTH);
   }
 
+  /**
+   * Get the {@link Configuration} object
+   */
   public Configuration getConf() {
     return this.conf;
   }
 
+  /**
+   * Gets all the fields for a given {@link WebPage}.
+   * Many datastores need to set up the mapreduce job by specifying the fields
+   * needed. All extensions that work on WebPage are able to specify what fields
+   * they need.
+   */
   @Override
   public Collection<WebPage.Field> getFields() {
     return FIELDS;

Added: 
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
URL: 
http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java?rev=1371708&view=auto
==============================================================================
--- 
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
 (added)
+++ 
nutch/branches/2.x/src/plugin/index-basic/src/test/org/apache/nutch/indexer/basic/TestBasicIndexingFilter.java
 Fri Aug 10 14:11:30 2012
@@ -0,0 +1,95 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.indexer.basic;
+
+import java.nio.ByteBuffer;
+
+import org.apache.avro.util.Utf8;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.fetcher.FetcherJob;
+import org.apache.nutch.indexer.NutchDocument;
+import org.apache.nutch.storage.WebPage;
+import org.apache.nutch.util.NutchConfiguration;
+import org.junit.Test;
+import junit.framework.TestCase;
+
+/**
+ * JUnit test case which tests
+ * 1. that the host, site, url, orig, content, title and tstamp fields are
+ * obtained by the filter (the check for the cache field is currently disabled).
+ * 2. that configurable maximum length functionality for titles actually works.
+ * This property defaults to 100 characters (see {@code indexer.max.title.length}
+ * in nutch-default.xml) but has been set to 10 for this test.
+ * 
+ * @author lewismc
+ */
+
+public class TestBasicIndexingFilter extends TestCase {
+  /** Asserts the filtered document has host, site, url, orig, content, title and tstamp. */
+  @Test
+  public void testBasicFields() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    WebPage page = new WebPage();
+    page.putToInlinks(new Utf8("http://nutch.apache.org/"), new Utf8("Welcome to Nutch"));
+    page.setTitle(new Utf8("Welcome to Nutch"));
+    page.setReprUrl(new Utf8("http://www.urldoesnotmatter.org"));
+    //ByteBuffer bbuf = ByteBuffer.allocate(10);
+    //bbuf.putInt(123456789);
+    //page.putToMetadata(new Utf8("Cache_policy"), bbuf);
+    page.setFetchTime(System.currentTimeMillis());
+    try {
+      filter.filter(doc, "http://www.apache.org/", page);
+    } catch(Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+    assertNotNull(doc);
+    assertTrue("check for host field ", doc.getFieldNames().contains("host"));
+    assertTrue("check for site field", doc.getFieldNames().contains("site"));
+    assertTrue("check for url field", doc.getFieldNames().contains("url"));
+    assertTrue("check for orig field", doc.getFieldNames().contains("orig"));
+    assertTrue("check for content field", doc.getFieldNames().contains("content"));
+    assertTrue("check for title field", doc.getFieldNames().contains("title"));
+    //assertTrue("check for cache field", doc.getFieldNames().contains("cache"));
+    assertTrue("check for tstamp field", doc.getFieldNames().contains("tstamp"));
+  }
+  /** Asserts the title is cut to the configured indexer.max.title.length (10 here). */
+  @Test
+  public void testTitleFieldLength() throws Exception {
+    Configuration conf = NutchConfiguration.create();
+    conf.setInt("indexer.max.title.length", 10);
+    BasicIndexingFilter filter = new BasicIndexingFilter();
+    filter.setConf(conf);
+    assertNotNull(filter);
+    NutchDocument doc = new NutchDocument();
+    WebPage page = new WebPage();
+    page.putToInlinks(new Utf8("http://exceedmaximumtitleurl.org/"), new Utf8("exceeding title site"));
+    page.setTitle(new Utf8("This title exceeds maximum characters"));
+    try {
+      filter.filter(doc, "http://www.apache.org/", page);
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.getMessage());
+    }
+    assertNotNull(doc);
+    assertEquals("assert title field only has 10 characters", 10, doc.getFieldValue("title").length());
+  }
+}


Reply via email to