[ 
https://issues.apache.org/jira/browse/NUTCH-2414?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16289890#comment-16289890
 ] 

ASF GitHub Bot commented on NUTCH-2414:
---------------------------------------

lewismc closed pull request #217: NUTCH-2414 - Allow LanguageIndexingFilter to actually filter documents by language
URL: https://github.com/apache/nutch/pull/217
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/conf/nutch-default.xml b/conf/nutch-default.xml
index c406907c5..05fe6912f 100644
--- a/conf/nutch-default.xml
+++ b/conf/nutch-default.xml
@@ -1566,6 +1566,16 @@ visit https://wiki.apache.org/nutch/SimilarityScoringFilter-->
   </description>
 </property>
 
+<property>
+  <name>lang.index.languages</name>
+  <value></value>
+  <description>If not empty, should be a comma separated list of language codes.
+  Only documents with one of these language codes will be indexed.
+  "unknown" is a valid language code, will match documents where language
+  detection failed.
+  </description>
+</property>
+
 <!-- index-static plugin properties -->
 
 <property>
diff --git a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
index fbfe8f978..cd954c70d 100644
--- a/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
+++ b/src/plugin/language-identifier/src/java/org/apache/nutch/analysis/lang/LanguageIndexingFilter.java
@@ -27,6 +27,9 @@
 import org.apache.nutch.metadata.Metadata;
 import org.apache.nutch.net.protocols.Response;
 
+import java.util.HashSet;
+import java.util.Set;
+
 // Hadoop imports
 import org.apache.hadoop.conf.Configuration;
 
@@ -49,6 +52,7 @@
 public class LanguageIndexingFilter implements IndexingFilter {
 
   private Configuration conf;
+  private Set<String> indexLangs;
 
   /**
    * Constructs a new Language Indexing Filter.
@@ -73,6 +77,10 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
       lang = "unknown";
     }
 
+    if (!indexLangs.isEmpty() && !indexLangs.contains(lang)) {
+       return null;
+    }
+    
     doc.add("lang", lang);
 
     return doc;
@@ -80,6 +88,7 @@ public NutchDocument filter(NutchDocument doc, Parse parse, Text url,
 
   public void setConf(Configuration conf) {
     this.conf = conf;
+    indexLangs = new HashSet<>(conf.getStringCollection("lang.index.languages"));
   }
 
   public Configuration getConf() {


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


> Allow LanguageIndexingFilter to actually filter documents by language.
> ----------------------------------------------------------------------
>
>                 Key: NUTCH-2414
>                 URL: https://issues.apache.org/jira/browse/NUTCH-2414
>             Project: Nutch
>          Issue Type: Improvement
>          Components: plugin
>    Affects Versions: 1.13
>            Reporter: Yossi Tamari
>            Priority: Minor
>             Fix For: 1.14
>
>
> It is often useful to only index pages in select languages (e.g. only those 
> languages that we intend to search in). At first glance it seems that this is 
> done by LanguageIndexingFilter, but currently all the filter does is add the 
> language as a field to the index.
> We can add a configuration property to LanguageIndexingFilter that will allow 
> it to only index languages specified in this property.



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)

Reply via email to