Author: jnioche
Date: Wed Mar 21 12:43:19 2012
New Revision: 1303371

URL: http://svn.apache.org/viewvc?rev=1303371&view=rev
Log:
NUTCH-809 Parse-metatags plugin (jnioche)

Added:
    nutch/trunk/src/plugin/parse-metatags/
    nutch/trunk/src/plugin/parse-metatags/README.txt
    nutch/trunk/src/plugin/parse-metatags/build.xml
    nutch/trunk/src/plugin/parse-metatags/ivy.xml
    nutch/trunk/src/plugin/parse-metatags/plugin.xml
    nutch/trunk/src/plugin/parse-metatags/sample/
    nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html
    nutch/trunk/src/plugin/parse-metatags/src/
    nutch/trunk/src/plugin/parse-metatags/src/java/
    nutch/trunk/src/plugin/parse-metatags/src/java/org/
    nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/
    nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/
    nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/
    
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
    nutch/trunk/src/plugin/parse-metatags/src/test/
    nutch/trunk/src/plugin/parse-metatags/src/test/org/
    nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/
    nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/
    nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/
    nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/
    
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/plugin/build.xml

Modified: nutch/trunk/CHANGES.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1303371&r1=1303370&r2=1303371&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Wed Mar 21 12:43:19 2012
@@ -1,5 +1,7 @@
 Nutch Change Log
 
+* NUTCH-809 Parse-metatags plugin (jnioche)
+
 * NUTCH-1310 Nutch to send HTTP-accept header (markus)
 
 * NUTCH-1305 Domain(blacklist)URLFilter to trim entries (markus)

Modified: nutch/trunk/conf/nutch-default.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1303371&r1=1303370&r2=1303371&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Wed Mar 21 12:43:19 2012
@@ -1198,11 +1198,11 @@
 
 <property>
   <name>index.parse.md</name>
-  <value></value>
+  <value>metatag.description,metatag.keywords</value>
   <description>
   Comma-separated list of keys to be taken from the parse metadata to generate 
fields.
   Can be used e.g. for 'description' or 'keywords' provided that these values 
are generated
-  by a parser.  
+  by a parser (see parse-metatags plugin)  
   </description>
 </property>
 
@@ -1223,6 +1223,18 @@
   </description>
 </property>
 
+<!-- parse-metatags plugin properties -->
+<property>
+  <name>metatags.names</name>
+  <value>description;keywords</value>
+  <description> Names of the metatags to extract, separated by ';'.
+  Use '*' to extract all metatags. The plugin prefixes the names with 'metatag.'
+  in the parse-metadata. For instance, to index description and keywords,
+  you need to activate the plugin index-metadata and set the value of the
+  parameter 'index.parse.md' to 'metatag.description,metatag.keywords'.
+  </description>
+</property>
+
 <!-- Temporary Hadoop 0.17.x workaround. -->
 
 <property>

Modified: nutch/trunk/src/plugin/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1303371&r1=1303370&r2=1303371&view=diff
==============================================================================
--- nutch/trunk/src/plugin/build.xml (original)
+++ nutch/trunk/src/plugin/build.xml Wed Mar 21 12:43:19 2012
@@ -48,6 +48,7 @@
      <ant dir="parse-ext" target="deploy"/>
      <ant dir="parse-js" target="deploy"/>
      <ant dir="parse-html" target="deploy"/>
+     <ant dir="parse-metatags" target="deploy"/>
      <ant dir="parse-swf" target="deploy"/>
      <ant dir="parse-tika" target="deploy"/>
      <ant dir="parse-zip" target="deploy"/>
@@ -82,6 +83,7 @@
      <!--ant dir="parse-ext" target="test"/-->
      <ant dir="feed" target="test"/>
      <ant dir="parse-html" target="test"/>
+     <ant dir="parse-metatags" target="test"/>
      <ant dir="parse-swf" target="test"/>
      <ant dir="parse-tika" target="test"/>
      <ant dir="parse-zip" target="test"/>
@@ -125,6 +127,7 @@
     <ant dir="parse-ext" target="clean"/>
     <ant dir="parse-js" target="clean"/>
     <ant dir="parse-html" target="clean"/>
+    <ant dir="parse-metatags" target="clean"/>
     <ant dir="parse-swf" target="clean"/>
     <ant dir="parse-tika" target="clean"/>
     <ant dir="parse-zip" target="clean"/>

Added: nutch/trunk/src/plugin/parse-metatags/README.txt
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/README.txt?rev=1303371&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/README.txt (added)
+++ nutch/trunk/src/plugin/parse-metatags/README.txt Wed Mar 21 12:43:19 2012
@@ -0,0 +1,17 @@
+Parse-metatags plugin
+
+The parse-metatags plugin consists of an HtmlParseFilter which takes as a 
parameter a list of metatag names with '*' as default value. The values are 
separated by ';'.
+In order to extract the values of the metatags description and keywords, you 
must specify in nutch-site.xml
+
+<property>
+  <name>metatags.names</name>
+  <value>description;keywords</value>
+</property>
+
+The plugin prefixes the names with 'metatag.' in the parse-metadata. For instance, to 
index description and keywords, you need to activate the plugin index-metadata 
and set the value of the parameter 'index.parse.md' to 
'metatag.description,metatag.keywords'.
+  
+This code has been developed by DigitalPebble Ltd and offered to the community 
by ANT.com
+
+
+
+

Added: nutch/trunk/src/plugin/parse-metatags/build.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/build.xml?rev=1303371&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/build.xml (added)
+++ nutch/trunk/src/plugin/parse-metatags/build.xml Wed Mar 21 12:43:19 2012
@@ -0,0 +1,33 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="parse-metatags" default="jar-core">
+
+       <import file="../build-plugin.xml" />
+
+       <!-- Deploy Unit test dependencies -->
+       <target name="deps-test">
+               <ant target="deploy" inheritall="false" 
dir="../nutch-extensionpoints" />
+               <ant target="deploy" inheritall="false" dir="../protocol-file" 
/>
+       </target>
+
+
+       <!-- for junit test -->
+       <mkdir dir="${build.test}/data" />
+       <copy file="sample/testMetatags.html" todir="${build.test}/data" />
+
+</project>

Added: nutch/trunk/src/plugin/parse-metatags/ivy.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/ivy.xml?rev=1303371&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/ivy.xml (added)
+++ nutch/trunk/src/plugin/parse-metatags/ivy.xml Wed Mar 21 12:43:19 2012
@@ -0,0 +1,41 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+  <dependencies>
+  </dependencies>
+  
+</ivy-module>

Added: nutch/trunk/src/plugin/parse-metatags/plugin.xml
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/plugin.xml?rev=1303371&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/plugin.xml (added)
+++ nutch/trunk/src/plugin/parse-metatags/plugin.xml Wed Mar 21 12:43:19 2012
@@ -0,0 +1,22 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<plugin
+   id="parse-metatags"
+   name="MetaTags"
+   version="1.0"
+   provider-name="digitalpebble.com">
+
+   <runtime>
+      <library name="parse-metatags.jar">
+         <export name="*"/>
+      </library>
+   </runtime>
+
+   <extension id="org.apache.nutch.parse.metatags.parser"
+              name="MetaTags Parser"
+              point="org.apache.nutch.parse.HtmlParseFilter">
+      <implementation id="MetaTagsParser"
+                      class="org.apache.nutch.parse.MetaTagsParser"/>
+   </extension>
+
+</plugin>
+

Added: nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html?rev=1303371&view=auto
==============================================================================
--- nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html (added)
+++ nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html Wed Mar 21 
12:43:19 2012
@@ -0,0 +1,9 @@
+<html>
+<head>
+<meta name="Keywords" content="This is a test of keywords" />
+<meta name="Description" content="This is a test of description" />
+</head>
+<body>
+text of the document
+</body>
+

Added: 
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java?rev=1303371&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
 (added)
+++ 
nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java
 Wed Mar 21 12:43:19 2012
@@ -0,0 +1,107 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.parse;
+
+import java.util.Enumeration;
+import java.util.HashSet;
+import java.util.Properties;
+import java.util.Set;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.protocol.Content;
+import org.w3c.dom.DocumentFragment;
+
+/** 
+ * Parse HTML meta tags (keywords, description) and store them in the parse 
metadata so that 
+ * they can be indexed with the index-metadata plugin with the prefix 
'metatag.'
+ ***/
+
+public class MetaTagsParser implements HtmlParseFilter {
+
+  private static final Log LOG = LogFactory.getLog(MetaTagsParser.class
+      .getName());
+
+  private Configuration conf;
+
+  private Set<String> metatagset = new HashSet<String>();
+
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+    // specify whether we want a specific subset of metadata
+    // by default take everything we can find
+    String metatags = conf.get("metatags.names", "*");
+    String[] values = metatags.split(";");
+    for (String val : values)
+      metatagset.add(val.toLowerCase());
+  }
+
+  public Configuration getConf() {
+    return this.conf;
+  }
+
+  public ParseResult filter(Content content, ParseResult parseResult,
+      HTMLMetaTags metaTags, DocumentFragment doc) {
+
+    Parse parse = parseResult.get(content.getUrl());
+    Metadata metadata = parse.getData().getParseMeta();
+
+    // check in the metadata first : the tika-parser
+    // might have stored the values there already
+
+    for (String mdName : metadata.names()) {
+      String value = metadata.get(mdName);
+      // check whether the name is in the list of what we want or if
+      // specified *
+      if (metatagset.contains("*") || 
metatagset.contains(mdName.toLowerCase())) {
+        LOG.debug("Found meta tag : " + mdName + "\t" + value);
+        metadata.add("metatag." + mdName.toLowerCase(), value);
+      }
+    }
+
+    Properties generalMetaTags = metaTags.getGeneralTags();
+    for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames
+        .hasMoreElements();) {
+      String name = (String) tagNames.nextElement();
+      String value = generalMetaTags.getProperty(name);
+      // check whether the name is in the list of what we want or if
+      // specified *
+      if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) 
{
+        LOG.debug("Found meta tag : " + name + "\t" + value);
+        metadata.add("metatag." + name.toLowerCase(), value);
+      }
+    }
+
+    Properties httpequiv = metaTags.getHttpEquivTags();
+    for (Enumeration tagNames = httpequiv.propertyNames(); tagNames
+        .hasMoreElements();) {
+      String name = (String) tagNames.nextElement();
+      String value = httpequiv.getProperty(name);
+      // check whether the name is in the list of what we want or if
+      // specified *
+      if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) 
{
+        LOG.debug("Found meta tag : " + name + "\t" + value);
+        metadata.add("metatag." + name.toLowerCase(), value);
+      }
+    }
+
+    return parseResult;
+  }
+
+}

Added: 
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
URL: 
http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java?rev=1303371&view=auto
==============================================================================
--- 
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
 (added)
+++ 
nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java
 Wed Mar 21 12:43:19 2012
@@ -0,0 +1,68 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.parse.html;
+
+import junit.framework.TestCase;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.parse.Parse;
+import org.apache.nutch.parse.ParseUtil;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolFactory;
+import org.apache.nutch.util.NutchConfiguration;
+
+public class TestMetatagParser extends TestCase {
+  
+  private String fileSeparator = System.getProperty("file.separator");
+  private String sampleDir = System.getProperty("test.data", ".");
+  private String sampleFile = "testMetatags.html";
+  private String description = "This is a test of description";
+  private String keywords = "This is a test of keywords";
+  
+  public TestMetatagParser(String name) {
+    super(name);
+  }
+  
+  public void testIt() {
+    Configuration conf = NutchConfiguration.create();
+    
+    String urlString = "file:" + sampleDir + fileSeparator + sampleFile;
+    
+    try {
+      Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString);
+      Content content = protocol.getProtocolOutput(new Text(urlString),
+          new CrawlDatum()).getContent();
+      
+      Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl());
+      
+      // check that we get the same values
+      Metadata parseMeta = parse.getData().getParseMeta();
+      
+      assertEquals(description, parseMeta.get("metatag.description"));
+      assertEquals(keywords, parseMeta.get("metatag.keywords"));
+    } catch (Exception e) {
+      e.printStackTrace();
+      fail(e.toString());
+    }
+  }
+  
+}


Reply via email to