Author: jorgelbg Date: Mon Feb 23 02:53:24 2015 New Revision: 1661600 URL: http://svn.apache.org/r1661600 Log: NUTCH-1928 Indexing filter of documents by the MIME type
Added: nutch/trunk/src/plugin/mimetype-filter/ nutch/trunk/src/plugin/mimetype-filter/build.xml nutch/trunk/src/plugin/mimetype-filter/ivy.xml nutch/trunk/src/plugin/mimetype-filter/plugin.xml nutch/trunk/src/plugin/mimetype-filter/sample/ nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt nutch/trunk/src/plugin/mimetype-filter/src/ nutch/trunk/src/plugin/mimetype-filter/src/java/ nutch/trunk/src/plugin/mimetype-filter/src/java/org/ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java nutch/trunk/src/plugin/mimetype-filter/src/test/ nutch/trunk/src/plugin/mimetype-filter/src/test/org/ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java Modified: nutch/trunk/build.xml nutch/trunk/conf/nutch-default.xml nutch/trunk/default.properties nutch/trunk/src/plugin/build.xml Modified: nutch/trunk/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff ============================================================================== --- nutch/trunk/build.xml (original) +++ nutch/trunk/build.xml Mon Feb 23 02:53:24 2015 @@ -178,6 +178,7 @@ <packageset dir="${plugins.dir}/index-more/src/java"/> <packageset dir="${plugins.dir}/index-geoip/src/java"/> <packageset 
dir="${plugins.dir}/index-static/src/java"/> + <packageset dir="${plugins.dir}/mimetype-filter/src/java"/> <packageset dir="${plugins.dir}/indexer-dummy/src/java"/> <packageset dir="${plugins.dir}/indexer-elastic/src/java/" /> <packageset dir="${plugins.dir}/indexer-solr/src/java"/> @@ -584,6 +585,7 @@ <packageset dir="${plugins.dir}/index-metadata/src/java"/> <packageset dir="${plugins.dir}/index-more/src/java"/> <packageset dir="${plugins.dir}/index-static/src/java"/> + <packageset dir="${plugins.dir}/mimetype-filter/src/java"/> <packageset dir="${plugins.dir}/indexer-dummy/src/java"/> <packageset dir="${plugins.dir}/indexer-elastic/src/java/" /> <packageset dir="${plugins.dir}/indexer-solr/src/java"/> @@ -969,6 +971,8 @@ <source path="${plugins.dir}/index-basic/src/test/" /> <source path="${plugins.dir}/index-geoip/src/java/" /> <source path="${plugins.dir}/index-geoip/src/test/" /> + <source path="${plugins.dir}/mimetype-filter/src/java/" /> + <source path="${plugins.dir}/mimetype-filter/src/test/" /> <source path="${plugins.dir}/indexer-dummy/src/java/" /> <source path="${plugins.dir}/indexer-solr/src/java/" /> <source path="${plugins.dir}/indexer-elastic/src/java/" /> Modified: nutch/trunk/conf/nutch-default.xml URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1661600&r1=1661599&r2=1661600&view=diff ============================================================================== --- nutch/trunk/conf/nutch-default.xml (original) +++ nutch/trunk/conf/nutch-default.xml Mon Feb 23 02:53:24 2015 @@ -1602,4 +1602,15 @@ <description>Whether to support multivalued headings.</description> </property> +<!-- mimetype-filter plugin properties --> + +<property> + <name>mimetype.filter.file</name> + <value>mimetype-filter.txt</value> + <description> + The configuration file for the mimetype-filter plugin. This file contains + the rules used to allow or deny the indexing of certain documents. 
+ </description> +</property> + </configuration> Modified: nutch/trunk/default.properties URL: http://svn.apache.org/viewvc/nutch/trunk/default.properties?rev=1661600&r1=1661599&r2=1661600&view=diff ============================================================================== --- nutch/trunk/default.properties (original) +++ nutch/trunk/default.properties Mon Feb 23 02:53:24 2015 @@ -148,6 +148,7 @@ plugins.index=\ org.apache.nutch.indexer.basic*:\ org.apache.nutch.indexer.feed*:\ org.apache.nutch.indexer.geoip*:\ + org.apache.nutch.indexer.filter*:\ org.apache.nutch.indexer.metadata*:\ org.apache.nutch.indexer.more*:\ org.apache.nutch.indexer.static*:\ Modified: nutch/trunk/src/plugin/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/build.xml?rev=1661600&r1=1661599&r2=1661600&view=diff ============================================================================== --- nutch/trunk/src/plugin/build.xml (original) +++ nutch/trunk/src/plugin/build.xml Mon Feb 23 02:53:24 2015 @@ -35,6 +35,7 @@ <ant dir="index-more" target="deploy"/> <ant dir="index-static" target="deploy"/> <ant dir="index-metadata" target="deploy"/> + <ant dir="mimetype-filter" target="deploy"/> <ant dir="indexer-dummy" target="deploy"/> <ant dir="indexer-elastic" target="deploy"/> <ant dir="indexer-solr" target="deploy"/> @@ -88,6 +89,7 @@ <ant dir="index-geoip" target="test"/> <ant dir="index-more" target="test"/> <ant dir="index-static" target="test"/> + <ant dir="mimetype-filter" target="test"/> <ant dir="language-identifier" target="test"/> <ant dir="lib-http" target="test"/> <ant dir="protocol-file" target="test"/> @@ -126,10 +128,11 @@ <ant dir="headings" target="clean"/> <ant dir="index-basic" target="clean"/> <ant dir="index-anchor" target="clean"/> - <ant dir="index-geoip" target="clean"/> + <ant dir="index-geoip" target="clean"/> <ant dir="index-more" target="clean"/> <ant dir="index-static" target="clean"/> <ant dir="index-metadata" target="clean"/> + <ant 
dir="mimetype-filter" target="clean"/> <ant dir="indexer-dummy" target="clean"/> <ant dir="indexer-elastic" target="clean"/> <ant dir="indexer-solr" target="clean"/> Added: nutch/trunk/src/plugin/mimetype-filter/build.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/build.xml?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/build.xml (added) +++ nutch/trunk/src/plugin/mimetype-filter/build.xml Mon Feb 23 02:53:24 2015 @@ -0,0 +1,28 @@ +<?xml version="1.0"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project name="mimetype-filter" default="jar-core"> + + <import file="../build-plugin.xml" /> + + <!-- for junit test --> + <mkdir dir="${build.test}/data"/> + <copy todir="${build.test}/data"> + <fileset dir="sample" includes="**/*.txt"/> + </copy> + +</project> Added: nutch/trunk/src/plugin/mimetype-filter/ivy.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/ivy.xml?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/ivy.xml (added) +++ nutch/trunk/src/plugin/mimetype-filter/ivy.xml Mon Feb 23 02:53:24 2015 @@ -0,0 +1,41 @@ +<?xml version="1.0" ?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<ivy-module version="1.0"> + <info organisation="org.apache.nutch" module="${ant.project.name}"> + <license name="Apache 2.0"/> + <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/> + <description> + Apache Nutch + </description> + </info> + + <configurations> + <include file="${nutch.root}/ivy/ivy-configurations.xml"/> + </configurations> + + <publications> + <!--get the artifact from our module name--> + <artifact conf="master"/> + </publications> + + <dependencies> + </dependencies> + +</ivy-module> Added: nutch/trunk/src/plugin/mimetype-filter/plugin.xml URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/plugin.xml?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/plugin.xml (added) +++ nutch/trunk/src/plugin/mimetype-filter/plugin.xml Mon Feb 23 02:53:24 2015 @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<plugin + id="mimetype-filter" + name="Filter indexed documents by the detected MIME" + version="1.0.0" + provider-name="nutch.org"> + + <runtime> + <library name="mimetype-filter.jar"> + <export name="*"/> + </library> + </runtime> + + <extension id="org.apache.nutch.indexer.filter" + name="Nutch MIME filter" + point="org.apache.nutch.indexer.IndexingFilter"> + <implementation id="MimeTypeIndexingFilter" + class="org.apache.nutch.indexer.filter.MimeTypeIndexingFilter"/> + </extension> + +</plugin> Added: nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt (added) +++ nutch/trunk/src/plugin/mimetype-filter/sample/allow-images.txt Mon Feb 23 02:53:24 2015 @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This filter can be configured to work in one of two modes (similar to +# suffix-url-filter) + +# default to reject ('-'): in this mode, only documents with a mimetype that +# matches the ones specified in the config file will be accepted; all other +# mimetypes will be rejected. + +# default to accept ('+'): in this mode, only documents with a mimetype +# that matches the ones specified in the config file will be rejected; +# all other mimetypes will be accepted. + +# The format of this config file is one mimetype (or mimetype prefix) per line, with no preceding +# whitespace. The order in which mimetypes are specified doesn't matter. Blank +# lines and comments (#) are allowed. +# + +- + +image Added: nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt (added) +++ nutch/trunk/src/plugin/mimetype-filter/sample/block-html.txt Mon Feb 23 02:53:24 2015 @@ -0,0 +1,34 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# This filter can be configured to work in one of two modes (similar to +# suffix-url-filter) + +# default to reject ('-'): in this mode, only documents with a mimetype that +# matches the ones specified in the config file will be accepted; all other +# mimetypes will be rejected. + +# default to accept ('+'): in this mode, only documents with a mimetype +# that matches the ones specified in the config file will be rejected; +# all other mimetypes will be accepted. + +# The format of this config file is one mimetype (or mimetype prefix) per line, with no preceding +# whitespace. The order in which mimetypes are specified doesn't matter. Blank +# lines and comments (#) are allowed. +# + ++ + +text/html \ No newline at end of file Added: nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java (added) +++ nutch/trunk/src/plugin/mimetype-filter/src/java/org/apache/nutch/indexer/filter/MimeTypeIndexingFilter.java Mon Feb 23 02:53:24 2015 @@ -0,0 +1,191 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.nutch.indexer.filter; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +// Nutch imports +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.Writable; + +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; + +import org.apache.nutch.indexer.IndexingException; +import org.apache.nutch.indexer.IndexingFilter; +import org.apache.nutch.indexer.NutchDocument; + +import org.apache.nutch.parse.Parse; +import org.apache.nutch.net.protocols.Response; + +import org.apache.nutch.util.MimeUtil; +import org.apache.nutch.util.PrefixStringMatcher; +import org.apache.nutch.util.TrieStringMatcher; +import org.apache.tika.Tika; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.Reader; +import java.util.ArrayList; +import java.util.List; + +/** + * An {@link org.apache.nutch.indexer.IndexingFilter} that allows filtering + * of documents based on the MIME Type detected by Tika + * + */ +public class MimeTypeIndexingFilter implements IndexingFilter { + + public static final String MIMEFILTER_REGEX_FILE = "mimetype.filter.file"; + + private static final Logger LOG = LoggerFactory + .getLogger(MimeTypeIndexingFilter.class); + + private MimeUtil MIME; + private Tika tika = new Tika(); + + private TrieStringMatcher trie; + + private Configuration conf; + + private boolean acceptMode = true; + + // Inherited JavaDoc + @Override + public NutchDocument filter(NutchDocument doc, Parse parse, Text url, + CrawlDatum datum, 
Inlinks inlinks) throws IndexingException { + + String mimeType; + String contentType; + + Writable tcontentType = datum.getMetaData() + .get(new Text(Response.CONTENT_TYPE)); + + if (tcontentType != null) { + contentType = tcontentType.toString(); + } else { + contentType = parse.getData().getMeta(Response.CONTENT_TYPE); + } + + if (contentType == null) { + mimeType = tika.detect(url.toString()); + } else { + mimeType = MIME.forName(MimeUtil.cleanMimeType(contentType)); + } + + contentType = mimeType; + + if (LOG.isInfoEnabled()) { + LOG.info(String.format("[%s] %s", contentType, url)); + } + + if (null != trie) { + if (trie.shortestMatch(contentType) == null) { + // no match, but + if (acceptMode) { + return doc; + } + return null; + } else { + // matched, but we are blocking + if (acceptMode) { + return null; + } + } + } + + return doc; + } + + /* + * ----------------------------- + * <implementation:Configurable> * + * ----------------------------- + */ + @Override + public void setConf(Configuration conf) { + this.conf = conf; + MIME = new MimeUtil(conf); + + // load the file of the values + String file = conf.get(MIMEFILTER_REGEX_FILE, ""); + + if (file != null) { + if (file.isEmpty()) { + LOG.warn(String + .format("Missing %s property, ALL mimetypes will be allowed", + MIMEFILTER_REGEX_FILE)); + } else { + Reader reader = conf.getConfResourceAsReader(file); + + try { + readConfiguration(reader); + } catch (IOException e) { + if (LOG.isErrorEnabled()) { + LOG.error(e.getMessage()); + } + + throw new RuntimeException(e.getMessage(), e); + } + } + } + } + + private void readConfiguration(Reader reader) throws IOException { + BufferedReader in = new BufferedReader(reader); + String line; + List rules = new ArrayList(); + + while (null != (line = in.readLine())) { + if (line.length() == 0) { + continue; + } + + char first = line.charAt(0); + switch (first) { + case ' ': + case '\n': + case '#': // skip blank & comment lines + break; + case '+': + acceptMode = 
true; + break; + case '-': + acceptMode = false; + break; + default: + rules.add(line); + break; + } + } + + trie = new PrefixStringMatcher(rules); + } + + @Override + public Configuration getConf() { + return this.conf; + } + /* + * ------------------------------ * </implementation:Configurable> * + * ------------------------------ + */ +} + Added: nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java?rev=1661600&view=auto ============================================================================== --- nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java (added) +++ nutch/trunk/src/plugin/mimetype-filter/src/test/org/apache/nutch/indexer/filter/MimeTypeIndexingFilterTest.java Mon Feb 23 02:53:24 2015 @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.nutch.indexer.filter; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.io.Text; +import org.apache.nutch.crawl.CrawlDatum; +import org.apache.nutch.crawl.Inlinks; +import org.apache.nutch.indexer.NutchDocument; +import org.apache.nutch.metadata.Metadata; +import org.apache.nutch.net.protocols.Response; +import org.apache.nutch.parse.Outlink; +import org.apache.nutch.parse.ParseData; +import org.apache.nutch.parse.ParseImpl; +import org.apache.nutch.parse.ParseStatus; +import org.apache.nutch.util.NutchConfiguration; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +/** + * JUnit based tests of class + * {@link org.apache.nutch.indexer.filter.MimeTypeIndexingFilter} + * + */ +public class MimeTypeIndexingFilterTest { + + private Configuration conf = NutchConfiguration.create(); + private MimeTypeIndexingFilter filter = new MimeTypeIndexingFilter(); + private String[] MIME_TYPES = { "text/html", "image/png", "application/pdf" }; + private ParseImpl[] parses = new ParseImpl[MIME_TYPES.length]; + private String sampleDir = System.getProperty("test.data", "."); + + @Before + public void setUp() throws Exception { + for (int i = 0; i < MIME_TYPES.length; i++) { + Metadata metadata = new Metadata(); + metadata.add(Response.CONTENT_TYPE, MIME_TYPES[i]); + + ParseImpl parse = new ParseImpl("text", + new ParseData(new ParseStatus(), "title", new Outlink[0], metadata)); + + parses[i] = parse; + } + } + + @Test + public void testMissingConfigFile() throws Exception { + String file = conf.get(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, ""); + Assert.assertEquals(String + .format("Property %s must not be present in the the configuration file", + MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE), "", file); + + filter.setConf(conf); + + // property not set so in this cases all documents must pass the filter + for (int i = 0; i < parses.length; i++) { + NutchDocument doc = filter.filter(new 
NutchDocument(), parses[i], + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + Assert.assertNotNull("All documents must be allowed by default", doc); + } + } + + @Test + public void testAllowOnlyImages() throws Exception { + conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "allow-images.txt"); + filter.setConf(conf); + + for (int i = 0; i < parses.length; i++) { + NutchDocument doc = filter.filter(new NutchDocument(), parses[i], + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + if (MIME_TYPES[i].contains("image")) { + Assert.assertNotNull("Allow only images", doc); + } else { + Assert.assertNull("Block everything else", doc); + } + } + } + + @Test + public void testBlockHTML() throws Exception { + conf.set(MimeTypeIndexingFilter.MIMEFILTER_REGEX_FILE, "block-html.txt"); + filter.setConf(conf); + + for (int i = 0; i < parses.length; i++) { + NutchDocument doc = filter.filter(new NutchDocument(), parses[i], + new Text("http://www.example.com/"), new CrawlDatum(), new Inlinks()); + + if (MIME_TYPES[i].contains("html")) { + Assert.assertNull("Block only HTML documents", doc); + } else { + Assert.assertNotNull("Allow everything else", doc); + } + } + } +}