Author: jnioche Date: Wed Mar 21 12:43:19 2012 New Revision: 1303371 URL: http://svn.apache.org/viewvc?rev=1303371&view=rev Log: NUTCH-809 Parse-metatags plugin (jnioche)
Added: nutch/trunk/src/plugin/parse-metatags/ nutch/trunk/src/plugin/parse-metatags/README.txt nutch/trunk/src/plugin/parse-metatags/build.xml nutch/trunk/src/plugin/parse-metatags/ivy.xml nutch/trunk/src/plugin/parse-metatags/plugin.xml nutch/trunk/src/plugin/parse-metatags/sample/ nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html nutch/trunk/src/plugin/parse-metatags/src/ nutch/trunk/src/plugin/parse-metatags/src/java/ nutch/trunk/src/plugin/parse-metatags/src/java/org/ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java nutch/trunk/src/plugin/parse-metatags/src/test/ nutch/trunk/src/plugin/parse-metatags/src/test/org/ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java Modified: nutch/trunk/CHANGES.txt nutch/trunk/conf/nutch-default.xml nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/CHANGES.txt URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1303371&r1=1303370&r2=1303371&view=diff ============================================================================== --- nutch/trunk/CHANGES.txt (original) +++ nutch/trunk/CHANGES.txt Wed Mar 21 12:43:19 2012 @@ -1,5 +1,7 @@ Nutch Change Log +* NUTCH-809 Parse-metatags plugin (jnioche) + * NUTCH-1310 Nutch to send HTTP-accept header (markus) * NUTCH-1305 Domain(blacklist)URLFilter to trim entries (markus) Modified: nutch/trunk/conf/nutch-default.xml URL: 
http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1303371&r1=1303370&r2=1303371&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Wed Mar 21 12:43:19 2012 @@ -1198,11 +1198,11 @@ <property> <name>index.parse.md</name> - <value></value> + <value>metatag.description,metatag.keywords</value> <description> Comma-separated list of keys to be taken from the parse metadata to generate fields. Can be used e.g. for 'description' or 'keywords' provided that these values are generated - by a parser. + by a parser (see parse-metatags plugin) </description> </property> @@ -1223,6 +1223,18 @@ </description> </property> +<!-- parse-metatags plugin properties --> +<property> + <name>metatags.names</name> + <value>description;keywords</value> + <description> Names of the metatags to extract, separated by ';'. + Use '*' to extract all metatags. Prefixes the names with 'metatag.' + in the parse-metadata. For instance to index description and keywords, + you need to activate the plugin index-metadata and set the value of the + parameter 'index.parse.md' to 'metatag.description,metatag.keywords'. + </description> +</property> + <!-- Temporary Hadoop 0.17.x workaround. 
--> <property> Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1303371&r1=1303370&r2=1303371&view=diff ============================================================================== --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Wed Mar 21 12:43:19 2012 @@ -48,6 +48,7 @@ <ant dir="parse-ext" target="deploy"/> <ant dir="parse-js" target="deploy"/> <ant dir="parse-html" target="deploy"/> + <ant dir="parse-metatags" target="deploy"/> <ant dir="parse-swf" target="deploy"/> <ant dir="parse-tika" target="deploy"/> <ant dir="parse-zip" target="deploy"/> @@ -82,6 +83,7 @@ <!--ant dir="parse-ext" target="test"/--> <ant dir="feed" target="test"/> <ant dir="parse-html" target="test"/> + <ant dir="parse-metatags" target="test"/> <ant dir="parse-swf" target="test"/> <ant dir="parse-tika" target="test"/> <ant dir="parse-zip" target="test"/> @@ -125,6 +127,7 @@ <ant dir="parse-ext" target="clean"/> <ant dir="parse-js" target="clean"/> <ant dir="parse-html" target="clean"/> + <ant dir="parse-metatags" target="clean"/> <ant dir="parse-swf" target="clean"/> <ant dir="parse-tika" target="clean"/> <ant dir="parse-zip" target="clean"/> Added: nutch/trunk/src/plugin/parse-metatags/README.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/README.txt?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/README.txt (added) +++ nutch/trunk/src/plugin/parse-metatags/README.txt Wed Mar 21 12:43:19 2012 @@ -0,0 +1,17 @@ +Parse-metatags plugin + +The parse-metatags plugin consists of a HTMLParserFilter which takes as parameter a list of metatag names with '*' as default value. The values are separated by ';'. 
+In order to extract the values of the metatags description and keywords, you must specify in nutch-site.xml: + +<property> + <name>metatags.names</name> + <value>description;keywords</value> +</property> + +The plugin prefixes the names with 'metatag.' in the parse-metadata. For instance, to index description and keywords, you need to activate the plugin index-metadata and set the value of the parameter 'index.parse.md' to 'metatag.description,metatag.keywords'. + +This code has been developed by DigitalPebble Ltd and offered to the community by ANT.com + + + + Added: nutch/trunk/src/plugin/parse-metatags/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/build.xml?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/build.xml (added) +++ nutch/trunk/src/plugin/parse-metatags/build.xml Wed Mar 21 12:43:19 2012 @@ -0,0 +1,33 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="parse-metatags" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- Deploy Unit test dependencies --> + <target name="deps-test"> + <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints" /> + <ant target="deploy" inheritall="false" dir="../protocol-file" /> + </target> + + + <!-- for junit test --> + <mkdir dir="${build.test}/data" /> + <copy file="sample/testMetatags.html" todir="${build.test}/data" /> + +</project> Added: nutch/trunk/src/plugin/parse-metatags/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/ivy.xml?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/ivy.xml (added) +++ nutch/trunk/src/plugin/parse-metatags/ivy.xml Wed Mar 21 12:43:19 2012 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="../../..//ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/trunk/src/plugin/parse-metatags/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/plugin.xml?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/plugin.xml (added) +++ nutch/trunk/src/plugin/parse-metatags/plugin.xml Wed Mar 21 12:43:19 2012 @@ -0,0 +1,22 @@ +<?xml version="1.0" encoding="UTF-8"?> +<plugin + id="parse-metatags" + name="MetaTags" + version="1.0" + provider-name="digitalpebble.com"> + + <runtime> + <library name="parse-metatags.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.parse.metatags.parser" + name="MetaTags Parser" + point="org.apache.nutch.parse.HtmlParseFilter"> + <implementation id="MetaTagsParser" + class="org.apache.nutch.parse.MetaTagsParser"/> + </extension> + +</plugin> + Added: nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html (added) +++ nutch/trunk/src/plugin/parse-metatags/sample/testMetatags.html Wed Mar 21 12:43:19 2012 @@ -0,0 +1,9 @@ +<html> +<head> +<meta name="Keywords" content="This is a test of keywords" /> +<meta name="Description" content="This is a test of 
description" /> +</head> +<body> +text of the document +</body> + Added: nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java (added) +++ nutch/trunk/src/plugin/parse-metatags/src/java/org/apache/nutch/parse/MetaTagsParser.java Wed Mar 21 12:43:19 2012 @@ -0,0 +1,107 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.nutch.parse; + +import java.util.Enumeration; +import java.util.HashSet; +import java.util.Properties; +import java.util.Set; + +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.protocol.Content; +import org.w3c.dom.DocumentFragment; + +/** + * Parse HTML meta tags (keywords, description) and store them in the parse metadata so that + * they can be indexed with the index-metadata plugin with the prefix 'metatag.' + ***/ + +public class MetaTagsParser implements HtmlParseFilter { + + private static final Log LOG = LogFactory.getLog(MetaTagsParser.class + .getName()); + + private Configuration conf; + + private Set<String> metatagset = new HashSet<String>(); + + public void setConf(Configuration conf) { + this.conf = conf; + // specify whether we want a specific subset of metadata + // by default take everything we can find + String metatags = conf.get("metatags.names", "*"); + String[] values = metatags.split(";"); + for (String val : values) + metatagset.add(val.toLowerCase()); + } + + public Configuration getConf() { + return this.conf; + } + + public ParseResult filter(Content content, ParseResult parseResult, + HTMLMetaTags metaTags, DocumentFragment doc) { + + Parse parse = parseResult.get(content.getUrl()); + Metadata metadata = parse.getData().getParseMeta(); + + // check in the metadata first : the tika-parser + // might have stored the values there already + + for (String mdName : metadata.names()) { + String value = metadata.get(mdName); + // check whether the name is in the list of what we want or if + // specified * + if (metatagset.contains("*") || metatagset.contains(mdName.toLowerCase())) { + LOG.debug("Found meta tag : " + mdName + "\t" + value); + metadata.add("metatag." 
+ mdName.toLowerCase(), value); + } + } + + Properties generalMetaTags = metaTags.getGeneralTags(); + for (Enumeration tagNames = generalMetaTags.propertyNames(); tagNames + .hasMoreElements();) { + String name = (String) tagNames.nextElement(); + String value = generalMetaTags.getProperty(name); + // check whether the name is in the list of what we want or if + // specified * + if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) { + LOG.debug("Found meta tag : " + name + "\t" + value); + metadata.add("metatag." + name.toLowerCase(), value); + } + } + + Properties httpequiv = metaTags.getHttpEquivTags(); + for (Enumeration tagNames = httpequiv.propertyNames(); tagNames + .hasMoreElements();) { + String name = (String) tagNames.nextElement(); + String value = httpequiv.getProperty(name); + // check whether the name is in the list of what we want or if + // specified * + if (metatagset.contains("*") || metatagset.contains(name.toLowerCase())) { + LOG.debug("Found meta tag : " + name + "\t" + value); + metadata.add("metatag." + name.toLowerCase(), value); + } + } + + return parseResult; + } + +} Added: nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java?rev=1303371&view=auto ============================================================================== --- nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java (added) +++ nutch/trunk/src/plugin/parse-metatags/src/test/org/apache/nutch/parse/html/TestMetatagParser.java Wed Mar 21 12:43:19 2012 @@ -0,0 +1,68 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.parse.html; + +import junit.framework.TestCase; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.parse.Parse; +import org.apache.nutch.parse.ParseUtil; +import org.apache.nutch.protocol.Content; +import org.apache.nutch.protocol.Protocol; +import org.apache.nutch.protocol.ProtocolFactory; +import org.apache.nutch.util.NutchConfiguration; + +public class TestMetatagParser extends TestCase { + + private String fileSeparator = System.getProperty("file.separator"); + private String sampleDir = System.getProperty("test.data", "."); + private String sampleFile = "testMetatags.html"; + private String description = "This is a test of description"; + private String keywords = "This is a test of keywords"; + + public TestMetatagParser(String name) { + super(name); + } + + public void testIt() { + Configuration conf = NutchConfiguration.create(); + + String urlString = "file:" + sampleDir + fileSeparator + sampleFile; + + try { + Protocol protocol = new ProtocolFactory(conf).getProtocol(urlString); + Content content = protocol.getProtocolOutput(new Text(urlString), + new CrawlDatum()).getContent(); + + Parse parse = new ParseUtil(conf).parse(content).get(content.getUrl()); + + // check that we get the same values + Metadata 
parseMeta = parse.getData().getParseMeta(); + + assertEquals(description, parseMeta.get("metatag.description")); + assertEquals(keywords, parseMeta.get("metatag.keywords")); + } catch (Exception e) { + e.printStackTrace(); + fail(e.toString()); + } + } + +}