[ https://issues.apache.org/jira/browse/NUTCH-2414?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16289890#comment-16289890 ]
ASF GitHub Bot commented on NUTCH-2414: --------------------------------------- lewismc closed pull request #217: NUTCH-2414 - Allow LanguageIndexingFilter to actually filter documents by language URL: https://github.com/apache/nutch/pull/217 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml index c406907c5..05fe6912f 100644 --- a/conf/nutch-default.xml +++ b/conf/nutch-default.xml @@ -1566,6 +1566,16 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter--> </description> </property> +<property> + <name>lang.index.languages</name> + <value></value> + <description>If not empty, should be a comma separated list of language codes. + Only documents with one of these language codes will be indexed. + "unknown" is a valid language code, will match documents where language + detection failed. 
+ </description> +</property> + <!-- index-static plugin properties --> <property> diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java index fbfe8f978..cd954c70d 100644 --- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java +++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java @@ -27,6 +27,9 @@ import org.apache.nutch.metadata.Metadata; import org.apache.nutch.net.protocols.Response; +import java.util.HashSet; +import java.util.Set; + // Hadoop imports import org.apache.hadoop.conf.Configuration; @@ -49,6 +52,7 @@ public class LanguageIndexingFilter implements IndexingFilter { private Configuration conf; + private Set<String> indexLangs; /** * Constructs a new Language Indexing Filter. @@ -73,6 +77,10 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, lang = "unknown"; } + if (!indexLangs.isEmpty() && !indexLangs.contains(lang)) { + return null; + } + doc.add("lang", lang); return doc; @@ -80,6 +88,7 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url, public void setConf(Configuration conf) { this.conf = conf; + indexLangs = new HashSet<>(conf.getStringCollection("lang.index.languages")); } public Configuration getConf() { ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > Allow LanguageIndexingFilter to actually filter documents by language. 
> ----------------------------------------------------------------------
>
> Key: NUTCH-2414
> URL: https://issues.apache.org/jira/browse/NUTCH-2414
> Project: Nutch
> Issue Type: Improvement
> Components: plugin
> Affects Versions: 1.13
> Reporter: Yossi Tamari
> Priority: Minor
> Fix For: 1.14
>
>
> It is often useful to only index pages in select languages (e.g. only those
> languages that we intend to search in). At first glance it seems that this is
> done by LanguageIndexingFilter, but currently all the filter does is add the
> language as a field to the index.
> We can add a configuration property to LanguageIndexingFilter that will allow
> it to only index languages specified in this property.

--
This message was sent by Atlassian JIRA (v6.4.14#64029)