Author: rwesten
Date: Mon Apr 15 06:47:13 2013
New Revision: 1467865

URL: http://svn.apache.org/r1467865
Log:
STANBOL-1031: The Jena TDB LDpath RDFBackend implementation now correctly 
handles literals with empty language; STANBOL-1016: implemented TripleFilter 
for the Jena TDB indexing source

Added:
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
   (with props)
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
   (with props)
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
   (with props)
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
   (with props)
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
   (with props)
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config
Modified:
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
    
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java

Modified: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
 (original)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/AbstractTdbBackend.java
 Mon Apr 15 06:47:13 2013
@@ -54,7 +54,8 @@ public abstract class AbstractTdbBackend
     private TypeMapper typeMapper = TypeMapper.getInstance();
     
     private Locale toLocale(String lang){
-        if(lang == null){ 
+      //Jena TDB uses '' for representing Literals without language
+        if(lang == null || lang.isEmpty()){ 
             return null;
         }
         Locale locale = localeCache.get(lang);

Modified: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
 (original)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/DestinationTripleGraph.java
 Mon Apr 15 06:47:13 2013
@@ -1,6 +1,9 @@
 package org.apache.stanbol.entityhub.indexing.source.jenatdb;
 
+import java.util.Map;
+
 import org.apache.jena.atlas.lib.Tuple;
+import org.apache.jena.atlas.logging.Log;
 import org.slf4j.Logger;
 
 import com.hp.hpl.jena.graph.Node;
@@ -25,23 +28,52 @@ import com.hp.hpl.jena.tdb.sys.Names;
  * <p>
  * This code is based on the DestinationGraph implementation private to the 
  * {@link TDBLoader} class.
+ * <p>
+ * In addition this implementation supports an {@link RdfImportFilter} that
+ * can be used to filter RDF triples read from RDF files before adding them
+ * to the RDF TripleStore. 
  * 
  * @author Rupert Westenthaler
  *
  */
 class DestinationTripleGraph implements BulkStreamRDF {
+    /**
+     * ImportFilter that accepts all triples. This is used in case 
+     * <code>null</code> is passed as {@link RdfImportFilter} to the 
+     * constructor
+     */
+    private static final RdfImportFilter NO_FILTER = new RdfImportFilter() {
+        @Override
+        public void setConfiguration(Map<String,Object> config) {}
+        @Override
+        public boolean needsInitialisation() { return false;}
+        @Override
+        public void initialise() {}
+        @Override
+        public void close() {}
+        @Override
+        public boolean accept(Node s, Node p, Node o) {return true;}
+    };
     final private DatasetGraphTDB dsg ;
     final private LoadMonitor monitor ;
     final private LoaderNodeTupleTable loaderTriples ;
     final private boolean startedEmpty ;
     private long count = 0 ;
+    private long filteredCount = 0;
     private StatsCollector stats ;
+    private RdfImportFilter importFilter;
+    private final Logger importLog;
 
-    DestinationTripleGraph(final DatasetGraphTDB dsg, Logger log) {
+    DestinationTripleGraph(final DatasetGraphTDB dsg, RdfImportFilter 
importFilter, Logger log) {
         this.dsg = dsg ;
         startedEmpty = dsg.isEmpty() ;
         monitor = new LoadMonitor(dsg, log, "triples", 
BulkLoader.DataTickPoint, BulkLoader.IndexTickPoint) ;
         loaderTriples = new 
LoaderNodeTupleTable(dsg.getTripleTable().getNodeTupleTable(), "triples", 
monitor) ;
+        if(importFilter == null){
+            this.importFilter = NO_FILTER;
+        } else {
+            this.importFilter = importFilter;
+        }
+        this.importLog = log;
     }
 
     @Override
@@ -49,19 +81,25 @@ class DestinationTripleGraph implements 
     {
         loaderTriples.loadStart() ;
         loaderTriples.loadDataStart() ;
-
         this.stats = new StatsCollector() ;
     }
+
+    /**
+     * Handles a single triple read from the RDF source. Triples accepted by
+     * the {@link RdfImportFilter} are loaded, recorded by the stats collector
+     * and counted. Rejected triples are only counted as filtered.
+     * @param s the subject
+     * @param p the predicate
+     * @param o the object
+     */
+    private void triple(Node s, Node p, Node o){
+        if(importFilter.accept(s, p, o)){
+            loaderTriples.load(s, p, o);
+            stats.record(null, s, p, o);
+            count++;
+        } else {
+            filteredCount++;
+            //log the progress every 100000 filtered triples
+            if(filteredCount%100000 == 0){
+                //percentage of filtered triples relative to all triples seen so far
+                importLog.info("Filtered: {} triples ({}%)",filteredCount,
+                    ((double)filteredCount*100/(double)(filteredCount+count)));
+            }
+        }
+    }
     @Override
-    final public void triple(Triple triple)
-    {
-        Node s = triple.getSubject() ;
-        Node p = triple.getPredicate() ;
-        Node o = triple.getObject() ;
-
-        loaderTriples.load(s, p, o)  ;
-        stats.record(null, s, p, o) ; 
-        count++ ;
+    final public void triple(Triple triple) {
+        triple(triple.getSubject(),triple.getPredicate(),triple.getObject());
     }
 
     @Override
@@ -81,23 +119,21 @@ class DestinationTripleGraph implements 
     }
 
     @Override
-    public void start()                     {}
+    public void start(){}
     @Override
     public void quad(Quad quad) { 
-        triple(quad.asTriple());
+        triple(quad.getSubject(),quad.getPredicate(),quad.getObject());
     }
     @Override
     public void tuple(Tuple<Node> tuple) { 
         if(tuple.size() >= 3){
-            loaderTriples.load(tuple.get(0), tuple.get(1), tuple.get(2))  ;
-            stats.record(null, tuple.get(0), tuple.get(1), tuple.get(2)) ; 
-            count++ ;
+            triple(tuple.get(0),tuple.get(1),tuple.get(2));
         } else {
             throw new TDBException("Tuple with < 3 Nodes encountered while 
loading a single graph");
         }
     }
     @Override
-    public void base(String base)           { }
+    public void base(String base){}
     @Override
     public void prefix(String prefix, String iri)  { } // TODO
     @Override

Added: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java?rev=1467865&view=auto
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
 (added)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
 Mon Apr 15 06:47:13 2013
@@ -0,0 +1,134 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Set;
+
+import org.osgi.service.cm.ConfigurationException;
+
+import com.hp.hpl.jena.graph.Node;
+
+/**
+ * Allows to filter Triples based on the language of the value. Triples with
+ * values other than <code>{@link Node#isLiteral()} == true</code> are 
+ * accepted. This is also true for all Literals that do not have a language 
+ * assigned.
+ * @author Rupert Westenthaler
+ *
+ */
+public class LiteralLanguageFilter implements RdfImportFilter {
+    /**
+     * Allows to configure the literal languages included/excluded during the
+     * import of RDF data<p>
+     * <b>Syntax: </b><code>{lang1},!{lang2},*</code>
+     * <ul>
+     * <li>'{lang}' includes a language
+     * <li>'!{lang}' excludes a language
+     * <li>',' is the separator, additional spaces are trimmed
+     * <li>'*' will include all languages not explicitly excluded
+     * </ul>
+     */
+    public static final String PARAM_LITERAL_LANGUAGES = "if-literal-language";
+    private Set<String> configuredLanguages;
+    private Set<String> excludedLanguages;
+    private boolean includeAll;
+    
+    public LiteralLanguageFilter(){}
+    
+    /**
+     * For unit tests
+     * @param config the test config
+     */
+    protected LiteralLanguageFilter(String config){
+        parseLanguages(config);
+    }
+    
+    
+    @Override
+    public void setConfiguration(Map<String,Object> config) {
+        
+        Object value = config.get(PARAM_LITERAL_LANGUAGES);
+        if(value == null){
+            includeAll = true;
+            excludedLanguages = Collections.emptySet();
+            configuredLanguages = Collections.emptySet();
+        } else {
+            parseLanguages(value.toString());
+        }
+    }
+
+    /**
+     * Parses the language configuration and (re-)initialises the included 
+     * languages, the excluded languages and the wildcard state of this filter.
+     * Entries are separated by ','. Each entry is trimmed and converted to
+     * lower case. Empty entries and a '!' without a language are ignored.
+     * @param config the language configuration
+     * @throws IllegalArgumentException if a language is both included and
+     * excluded by the parsed configuration
+     */
+    private void parseLanguages(String config){
+        configuredLanguages = new HashSet<String>();
+        excludedLanguages = new HashSet<String>();
+        //reset the wildcard state so that a re-configuration does not inherit
+        //the '*' of a previous configuration
+        includeAll = false;
+        for(String entry : config.split(",")){
+            String lang = entry.trim().toLowerCase(Locale.ROOT);
+            if(lang.isEmpty()){
+                continue; //ignore empty entries
+            }
+            if(lang.equals("*")){
+                includeAll = true;
+                continue;
+            }
+            if(lang.charAt(0) == '!'){ //exclude
+                lang = lang.substring(1).trim();
+                if(lang.isEmpty()){
+                    continue; //only a '!' without a language
+                }
+                if(configuredLanguages.contains(lang)){
+                    throw new IllegalArgumentException(
+                        "Langauge '"+lang+"' is both included and excluded (config: "
+                        + config+")");
+                }
+                excludedLanguages.add(lang);
+            } else { //include
+                if(excludedLanguages.contains(lang)){
+                    throw new IllegalArgumentException( 
+                        "Langauge '"+lang+"' is both included and excluded (config: "
+                        + config+")");
+                }
+                configuredLanguages.add(lang);
+            }
+        }
+    }
+    
+    @Override
+    public boolean needsInitialisation() {
+        return false;
+    }
+
+    @Override
+    public void initialise() {
+    }
+
+    @Override
+    public void close() {
+    }
+
+    /**
+     * Accepts all triples with a non-literal value and all literals without
+     * a language. Literals with a language are accepted if the language is
+     * included by the configuration: explicitly, or via '*' as long as it is
+     * not excluded.
+     * @param s the subject (not used)
+     * @param p the predicate (not used)
+     * @param o the object of the triple
+     * @return <code>true</code> if the triple is accepted
+     */
+    @Override
+    public boolean accept(Node s, Node p, Node o) {
+        if(!o.isLiteral()){
+            return true; //accept all none literals
+        }
+        if(includeAll && excludedLanguages.isEmpty()){
+            return true; //deactivated
+        }
+        String lang = o.getLiteralLanguage();
+        if(lang == null || lang.isEmpty()){
+            //no plain literal (null) or default language (empty)
+            return true; //accept it
+        }
+        //the configured languages are stored in lower case. Language tags
+        //are case insensitive, so the language of the literal needs to be
+        //normalised the same way before the lookup
+        lang = lang.toLowerCase(Locale.ROOT);
+        if(includeAll){
+            return !excludedLanguages.contains(lang);
+        } else {
+            return configuredLanguages.contains(lang);
+        }
+    }
+
+}

Propchange: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LiteralLanguageFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java?rev=1467865&view=auto
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
 (added)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
 Mon Apr 15 06:47:13 2013
@@ -0,0 +1,182 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+import java.util.TreeMap;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.commons.namespaceprefix.NamespaceMappingUtils;
+import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixProvider;
+import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixService;
+import org.apache.stanbol.entityhub.indexing.core.config.IndexingConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import com.hp.hpl.jena.graph.Node;
+
+public class PropertyPrefixFilter implements RdfImportFilter {
+    
+    private final Logger log = 
LoggerFactory.getLogger(PropertyPrefixFilter.class);
+    
+    /**
+     * Links to a file that defines included & excluded properties (one per 
+     * line)<p>
+     * <b>Syntax</b>
+     * <ul>
+     * <li>Lines starting with '#' are ignored
+     * <li>'!{prefix}' will exclude all properties starting with the {prefix}.
+     * <li>'{prefix}' will include all properties starting with {prefix}
+     * <li>'*' will include all properties not explicitly excluded
+     * <li> Namespace prefixes are supported!
+     * <li> '{prefix}*' is also supported. However all {prefix} values are
+     * interpreted like that.
+     * </ul>
+     * <b>NOTES</b>: (1) Longer prefixes are matched first. (2) All processed 
+     * values are stored in-memory. That means that matching prefixes are only 
+     * calculated on the first appearance of a property. 
+     */
+    public static final String PARAM_PROPERTY_FILTERS = "if-property-filter";
+    
+    
+    public PropertyPrefixFilter(){}
+    /**
+     * For unit tests only
+     * @param nsPrefixService
+     * @param lines
+     */
+    protected PropertyPrefixFilter(NamespacePrefixProvider nsPrefixService, 
+            List<String> lines){
+        parsePropertyPrefixConfig(nsPrefixService, lines);
+    }
+    
+    private Map<String, Boolean> propertyPrefixMap;
+    private Map<String, Boolean> propertyMap;
+    private boolean includeAll;
+    
+    
+    @Override
+    public void setConfiguration(Map<String,Object> config) {
+        IndexingConfig indexingConfig = 
(IndexingConfig)config.get(IndexingConfig.KEY_INDEXING_CONFIG);
+        NamespacePrefixService nsPrefixService = 
indexingConfig.getNamespacePrefixService();
+        log.info("Configure {}",getClass().getSimpleName());
+        Object value = config.get(PARAM_PROPERTY_FILTERS);
+        if(value == null){
+            propertyPrefixMap = Collections.emptyMap();
+            propertyMap = Collections.emptyMap();
+            includeAll = true;
+        } else {
+            log.info(" > property Prefix Filters");
+            //ensure that longer prefixes are first
+            File propertyPrefixConfig = 
indexingConfig.getConfigFile(value.toString());
+            List<String> lines;
+            InputStream in = null;
+            try {
+                in = new FileInputStream(propertyPrefixConfig);
+                lines = IOUtils.readLines(in,"UTF-8");
+            }catch (IOException e) {
+                throw new IllegalArgumentException("Unable to read property 
filter configuration "
+                    + "from the configured File 
"+propertyPrefixConfig.getAbsolutePath(),e);
+            } finally {
+                IOUtils.closeQuietly(in);
+            }
+            parsePropertyPrefixConfig(nsPrefixService, lines);
+        }
+        
+    }
+
+    /**
+     * Parses the lines of a property prefix configuration and initialises
+     * the prefix map, the cache of already processed properties and the
+     * wildcard state of this filter. The parsed list is not modified.
+     * Lines are trimmed before they are evaluated. Empty lines, comments
+     * ('#') and a '!' without a prefix are ignored.
+     * @param nsPrefixService used to resolve namespace prefixes used in the
+     * configuration
+     * @param lines the lines of the configuration
+     * @throws IllegalArgumentException if a namespace prefix used by a line
+     * can not be resolved
+     */
+    private void parsePropertyPrefixConfig(NamespacePrefixProvider nsPrefixService, List<String> lines) {
+        //ensure that longer prefixes are first
+        propertyPrefixMap = new TreeMap<String,Boolean>(new Comparator<String>() {
+            @Override
+            public int compare(String o1, String o2) {
+                int length = o2.length() - o1.length();
+                if(length != 0){
+                    return length;
+                } else {
+                    return o1.compareTo(o2);
+                }
+            }
+        });
+        propertyMap = new HashMap<String,Boolean>();
+        //determine the wildcard state without modifying the parsed list
+        //(it might be shared with the caller or be unmodifiable)
+        includeAll = false;
+        for(String line : lines){
+            if(line.trim().equals("*")){
+                includeAll = true;
+                break;
+            }
+        }
+        log.info("    - includeAll: {}",includeAll);
+        for(String rawLine : lines){
+            String line = rawLine.trim();
+            if(line.isEmpty() || line.startsWith("#") || line.equals("*")){
+                continue; //ignore comment, empty lines and multiple '*'
+            }
+            boolean exclude = line.charAt(0) == '!';
+            String prefix = exclude ? line.substring(1).trim() : line;
+            if(prefix.isEmpty()){
+                log.warn("    - ignore '!' without a prefix");
+                continue;
+            }
+            if(includeAll && !exclude){
+                continue; //ignore includes if * is active
+            }
+            String uri; 
+            String nsPrefix = NamespaceMappingUtils.getPrefix(prefix);
+            if(nsPrefix != null){
+                String ns = nsPrefixService.getNamespace(nsPrefix);
+                if(ns == null){
+                    throw new IllegalArgumentException("Unable to resolve namesoace prefix used by '"
+                            +prefix+"' by using the NamespacePrefixService!");
+                }
+                //replace '{nsPrefix}:' with the resolved namespace
+                uri = new StringBuilder(ns).append(prefix,nsPrefix.length()+1, prefix.length()).toString();
+            } else {
+                uri = prefix;
+            }
+            //a trailing '*' is optional as all values are used as prefixes
+            if(!uri.isEmpty() && uri.charAt(uri.length()-1) == '*'){
+                uri = uri.substring(0, uri.length()-1);
+            }
+            log.info("    - '{}' {}", uri, exclude ? "excluded" : "included");
+            propertyPrefixMap.put(uri, !exclude);
+        }
+    }
+
+    @Override
+    public boolean needsInitialisation() {
+        return false;
+    }
+
+    @Override
+    public void initialise() {
+    }
+
+    @Override
+    public void close() {
+    }
+
+    /**
+     * Decides based on the predicate if a triple is accepted. The longest
+     * configured prefix matching the URI of the predicate defines if the
+     * triple is included or excluded. If no prefix matches, the triple is
+     * accepted only if '*' was configured. The decision for a property is 
+     * cached, so prefixes are only matched on its first appearance.
+     * @param s the subject (not used)
+     * @param p the predicate
+     * @param o the object (not used)
+     * @return <code>true</code> if the triple is accepted. Always 
+     * <code>false</code> if the predicate is not an URI.
+     */
+    @Override
+    public boolean accept(Node s, Node p, Node o) {
+        if(!p.isURI()){
+            return false;
+        }
+        if(includeAll && propertyPrefixMap.isEmpty()){
+            return true;
+        }
+        String property = p.getURI();
+        Boolean state = propertyMap.get(property);
+        if(state != null){
+            return state;
+        }
+        //first time we encounter this property ... need to calculate
+        for(Entry<String,Boolean> entry : propertyPrefixMap.entrySet()){
+            if(property.startsWith(entry.getKey())){
+                propertyMap.put(property, entry.getValue());
+                return entry.getValue();
+            }
+        }
+        //no match ... the state is defined by includeAll. The same value
+        //needs to be returned for the first and all later appearances
+        propertyMap.put(property, includeAll);
+        return includeAll;
+    }
+
+}

Propchange: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java?rev=1467865&view=auto
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
 (added)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
 Mon Apr 15 06:47:13 2013
@@ -0,0 +1,18 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import org.apache.stanbol.entityhub.indexing.core.IndexingComponent;
+
+import com.hp.hpl.jena.graph.Node;
+
+/**
+ * Allows to filter Triples parsed from RDF files. Useful to NOT import some
+ * RDF triples from RDF dumps that are not relevant for the indexing process.
+ * @author Rupert Westenthaler
+ *
+ */
+public interface RdfImportFilter extends IndexingComponent{
+
+    /**
+     * Called for every triple read from the RDF source to decide if it
+     * is imported.
+     * @param s the subject of the triple
+     * @param p the predicate of the triple
+     * @param o the object of the triple
+     * @return <code>true</code> to import the triple, <code>false</code> to
+     * filter it
+     */
+    public boolean accept(Node s, Node p, Node o);
+    
+}

Propchange: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfImportFilter.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
 (original)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfIndexingSource.java
 Mon Apr 15 06:47:13 2013
@@ -44,6 +44,7 @@ import org.apache.stanbol.entityhub.serv
 import org.apache.stanbol.entityhub.servicesapi.model.Text;
 import org.apache.stanbol.entityhub.servicesapi.model.ValueFactory;
 import org.apache.stanbol.entityhub.servicesapi.util.ModelUtils;
+import org.joda.time.field.ImpreciseDateTimeField;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -114,6 +115,13 @@ public class RdfIndexingSource extends A
      */
     public static final String PARAM_IMPORT_SOURCE = "import";
     /**
+     * Allows to configure a {@link RdfImportFilter} (full qualified class 
name).
+     * If present it gets the full configuration set for this component parsed.
+     * This means that the import filter can be configured by the same 
+     * configuration as this component.
+     */
+    public static final String PARAM_IMPORT_FILTER = "import-filter";
+    /**
      * The default directory name used to search for RDF files to be imported
      */
     public static final String DEFAULT_SOURCE_FOLDER_NAME = "rdfdata";
@@ -140,6 +148,7 @@ public class RdfIndexingSource extends A
      * used for logging a single WARN level entry on the first ignored BNode
      */
     private boolean bnodeIgnored = false;
+    private RdfImportFilter importFilter;
     
     /**
      * Default Constructor relaying on that {@link #setConfiguration(Map)} is
@@ -168,17 +177,20 @@ public class RdfIndexingSource extends A
      * imported
      * @param valueFactory The {@link ValueFactory} used to create instances
      * or <code>null</code> to use the default implementation.
+     * @param importFilter Optionally an importFilter used for filtering some
+     * triples read from the RDF source files.
      */
     public RdfIndexingSource(File modelLocation, 
                                File sourceFileOrDirectory,
-                               ValueFactory valueFactory){
+                               ValueFactory valueFactory,
+                               RdfImportFilter importFilter){
         if(modelLocation == null){
             throw new IllegalArgumentException("The parsed model location MUST 
NOT be NULL!");
         }
         //init the store
         this.indexingDataset = initTDBDataset(modelLocation);
         //use a ResourceLoader that fails on the first invalid RDF file 
(STANBOL-328)
-        this.loader =  new ResourceLoader(new 
RdfResourceImporter(indexingDataset), true,true);
+        this.loader =  new ResourceLoader(new 
RdfResourceImporter(indexingDataset,importFilter), true,true);
         loader.addResource(sourceFileOrDirectory);
     }
     @Override
@@ -187,10 +199,48 @@ public class RdfIndexingSource extends A
         //first init the RDF Model
         this.indexingDataset = Utils.getTDBDataset(config);
         //second we need to check if we need to import RDF files to the RDF 
model
+        //look if we need want to use an import filter
+        Object value = config.get(PARAM_IMPORT_FILTER);
+        if(value == null){
+            log.info("No RDF Import Filter configured");
+            importFilter = null;
+        } else {
+            String[] filterNames = value.toString().split(",");
+            List<RdfImportFilter> filters = new ArrayList<RdfImportFilter>();
+            ClassLoader cl = indexingConfig.getClass().getClassLoader();
+            for(String filterName : filterNames){
+                filterName = filterName.trim();
+                try {
+                    Class<? extends RdfImportFilter> importFilterClass = 
cl.loadClass(
+                        filterName).asSubclass(RdfImportFilter.class);
+                    RdfImportFilter filter = importFilterClass.newInstance();
+                    filter.setConfiguration(config);
+                    filters.add(filter);
+                    log.info("Use RDF ImportFilter {} (type: 
{})",importFilter,importFilterClass.getSimpleName());
+                } catch (ClassNotFoundException e) {
+                    throw new IllegalArgumentException("Configured 
RdfImportFilter '"
+                        +filterName+"' not found", e);
+                } catch (InstantiationException e) {
+                    throw new IllegalArgumentException("Configured 
RdfImportFilter '"
+                            +filterName+"' can not be instantiated", e);
+                } catch (IllegalAccessException e) {
+                    throw new IllegalArgumentException("Configured 
RdfImportFilter '"
+                            +filterName+"' can not be created", e);
+                }
+            }
+            if(filters.isEmpty()){
+                this.importFilter = null;
+            } else if(filters.size() == 1){
+                this.importFilter = filters.get(0);
+            } else {
+                this.importFilter = new UnionImportFilter(filters.toArray(
+                    new RdfImportFilter[filters.size()]));
+            }
+        }
         //create the ResourceLoader
-        this.loader =  new ResourceLoader(new 
RdfResourceImporter(indexingDataset), true);
+        this.loader =  new ResourceLoader(new 
RdfResourceImporter(indexingDataset, importFilter), true);
         
-        Object value = config.get(PARAM_IMPORTED_FOLDER);
+        value = config.get(PARAM_IMPORTED_FOLDER);
         String importedFolderName;
         if(value != null && !value.toString().isEmpty()){
             importedFolderName = value.toString();
@@ -281,17 +331,25 @@ public class RdfIndexingSource extends A
     }
     @Override
     public boolean needsInitialisation() {
-        //if there are resources with the state REGISTERED we need an 
initialisation
-        return !loader.getResources(ResourceState.REGISTERED).isEmpty();
+        return (importFilter != null && importFilter.needsInitialisation()) ||
+                !loader.getResources(ResourceState.REGISTERED).isEmpty();
     }
     @Override
     public void initialise(){
-        loader.loadResources();
+        if(importFilter != null && importFilter.needsInitialisation()){
+            importFilter.initialise();
+        }
+        if(!loader.getResources(ResourceState.REGISTERED).isEmpty()){
+            loader.loadResources();
+        }
     }
     @Override
     public void close() {
         loader = null;
         indexingDataset.close();
+        if(importFilter != null){
+            importFilter.close();
+        }
     }
     public void debug(){
         String entityVar = "s";
@@ -345,20 +403,32 @@ public class RdfIndexingSource extends A
             resource = Node.createURI(id);
         }
         Representation source = vf.createRepresentation(id);
-        ExtendedIterator<Triple> outgoing = 
indexingDataset.getDefaultGraph().find(resource, null, null);
-        boolean found = outgoing.hasNext();
-        while(outgoing.hasNext()){ //iterate over the statements for that 
resource
-            Triple statement = outgoing.next();
-            Node predicate = statement.getPredicate();
-            if(predicate == null || !predicate.isURI()){
-                log.warn("Ignore field {} for resource {} because it is null 
or not an URI!",
-                    predicate,resource);
-            } else {
-                String field = predicate.getURI();
-                Node value = statement.getObject();
-                processValue(value, source, field);
-            } //end else predicate != null
-        } //end iteration over resource triple
+        boolean found;
+        ExtendedIterator<Triple> outgoing = null;
+        try { // There may still be exceptions while reading triples
+            outgoing = indexingDataset.getDefaultGraph().find(resource, null, 
null);
+            found = outgoing.hasNext();
+            while(outgoing.hasNext()){ //iterate over the statements for that 
resource
+                Triple statement = outgoing.next();
+                Node predicate = statement.getPredicate();
+                if(predicate == null || !predicate.isURI()){
+                    log.warn("Ignore field {} for resource {} because it is 
null or not an URI!",
+                        predicate,resource);
+                } else {
+                    String field = predicate.getURI();
+                    Node value = statement.getObject();
+                    processValue(value, source, field);
+                } //end else predicate != null
+            } //end iteration over resource triple
+        } catch (Exception e) {
+            log.warn("Unable to retrieve entity data for Entity '"+id+"'",e);
+            found = false;
+            try {
+                if(outgoing != null){
+                    outgoing.close();
+                }
+            } catch (Exception e1) { /* ignore */}
+        }
         if(found) {
             if(log.isTraceEnabled()){
                 log.info("Resource: \n{}", 
ModelUtils.getRepresentationInfo(source));
@@ -407,9 +477,9 @@ public class RdfIndexingSource extends A
                         if(duration != null && !duration.isEmpty()) {
                             source.add(field, literalValue.toString());
                         }
-                    } else {
+                    } else if(!ll.getLexicalForm().isEmpty()){
                         source.add(field, literalValue);
-                    }
+                    } //else ignore literals that are empty
                 } catch (DatatypeFormatException e) {
                     log.warn(" Unable to convert {} to {} -> use lecicalForm",
                         ll.getLexicalForm(),ll.getDatatype());
@@ -765,5 +835,40 @@ public class RdfIndexingSource extends A
             return super.createURI(uri);
         }
     }
+    /**
+     * Used in case multiple {@link RdfImportFilter}s are configured. Despite the
+     * name, a triple is accepted only if ALL wrapped filters accept it (an empty
+     * array accepts everything). Life cycle calls are NOT forwarded to the filters.
+     * @author Rupert Westenthaler
+     */
+    private static class UnionImportFilter implements RdfImportFilter {
+        /** the wrapped filters, consulted in array order */
+        private final RdfImportFilter[] filters;
+        UnionImportFilter(RdfImportFilter[] filters){
+            this.filters = filters;
+        }
+        
+        @Override
+        public void setConfiguration(Map<String,Object> config) {} //no-op
+
+        @Override
+        public boolean needsInitialisation() { return false;}
+
+        @Override
+        public void initialise() {} //no-op
+
+        @Override
+        public void close() {} //no-op: the wrapped filters are not closed here
+
+        @Override
+        public boolean accept(Node s, Node p, Node o) {
+            for(RdfImportFilter filter : filters){
+                if(!filter.accept(s, p, o)){
+                    return false; //rejected: remaining filters are not consulted
+                }
+            }
+            return true;
+        }
+    }
     
 }

Modified: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java?rev=1467865&r1=1467864&r2=1467865&view=diff
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
 (original)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/main/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/RdfResourceImporter.java
 Mon Apr 15 06:47:13 2013
@@ -37,12 +37,12 @@ public class RdfResourceImporter impleme
     private static final Logger log = LoggerFactory.getLogger(RdfResourceImporter.class);
    // private final DatasetGraphTDB indexingDataset;
     private final DestinationTripleGraph destination;
-    public RdfResourceImporter(DatasetGraphTDB indexingDataset){
+    public RdfResourceImporter(DatasetGraphTDB indexingDataset, RdfImportFilter importFilter){
         if(indexingDataset == null){
             throw new IllegalArgumentException("The parsed DatasetGraphTDB instance MUST NOT be NULL!");
         }
         //this.indexingDataset = indexingDataset;
-        this.destination = new DestinationTripleGraph(indexingDataset,log);
+        this.destination = new DestinationTripleGraph(indexingDataset,importFilter,log); //importFilter is forwarded without a null check
     }
 
     @Override

Added: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java?rev=1467865&view=auto
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
 (added)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
 Mon Apr 15 06:47:13 2013
@@ -0,0 +1,58 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import com.hp.hpl.jena.datatypes.RDFDatatype;
+import com.hp.hpl.jena.datatypes.xsd.XSDDatatype;
+import com.hp.hpl.jena.graph.Node;
+
+/** Tests the language based filtering of literals by the {@link LiteralLanguageFilter} */
+public class LanguageLiteralFilterTest {
+
+    /** expects an IllegalArgumentException for a config that includes and excludes "de" */
+    @Test(expected=IllegalArgumentException.class)
+    public void testIncludeExcludeConfig1(){
+        new LiteralLanguageFilter("en,de,!de");
+    }
+    /** same expectation with the exclusion "!de" stated before the inclusion */
+    @Test(expected=IllegalArgumentException.class)
+    public void testIncludeExcludeConfig2(){
+        new LiteralLanguageFilter("en,!de,de");
+    }
+    /** an "en" literal, plain and typed literals, bnodes and URIs are all accepted */
+    @Test
+    public void testDataTypes(){
+        RdfImportFilter langFilter = new LiteralLanguageFilter("en,de");
+        Node[] objects = new Node[]{
+            Node.createLiteral("test", "en", false),
+            Node.createLiteral("test"),
+            Node.createLiteral("10",XSDDatatype.XSDint),
+            Node.createAnon(),
+            Node.createURI("urn:test")};
+        for(Node object : objects){
+            Assert.assertTrue(langFilter.accept(null, null, object));
+        }
+    }
+    /** with the config "en,de" literals in "fr" are rejected */
+    @Test
+    public void testIncludeTest(){
+        RdfImportFilter langFilter = new LiteralLanguageFilter("en,de");
+        Assert.assertTrue(accepts(langFilter, "en"));
+        Assert.assertTrue(accepts(langFilter, "de"));
+        Assert.assertFalse(accepts(langFilter, "fr"));
+    }
+    /** with the config "*,en,!de" only literals in "de" are rejected */
+    @Test
+    public void testExcludeTest(){
+        RdfImportFilter langFilter = new LiteralLanguageFilter("*,en,!de");
+        Assert.assertTrue(accepts(langFilter, "en"));
+        Assert.assertFalse(accepts(langFilter, "de"));
+        Assert.assertTrue(accepts(langFilter, "fr"));
+    }
+    /** offers the literal "test" with the given language as object of a triple */
+    private static boolean accepts(RdfImportFilter langFilter, String lang){
+        return langFilter.accept(null, null, Node.createLiteral("test", lang, false));
+    }
+    
+}

Propchange: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/LanguageLiteralFilterTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java?rev=1467865&view=auto
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
 (added)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
 Mon Apr 15 06:47:13 2013
@@ -0,0 +1,87 @@
+package org.apache.stanbol.entityhub.indexing.source.jenatdb;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.commons.namespaceprefix.NamespacePrefixProvider;
+import 
org.apache.stanbol.commons.namespaceprefix.impl.NamespacePrefixProviderImpl;
+import org.apache.stanbol.entityhub.servicesapi.defaults.NamespaceEnum;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import com.hp.hpl.jena.graph.Node;
+
+/** Tests the {@link PropertyPrefixFilter} with the rules of the "prefix.config" test resource */
+public class PropertyPrefixFilterTest {
+    
+    private static final String FB = "http://rdf.freebase.com/ns/"; //registered with the "fb" prefix
+    /** name of the classpath resource with the filter configuration */
+    private static final String TEST_CONFIG = "prefix.config";
+    private static NamespacePrefixProvider nsPrefixProvider;
+    /** prefix to namespace mappings used to create the {@link #nsPrefixProvider} */
+    private static final Map<String,String> nsMappings = new HashMap<String,String>();
+    static {
+        nsMappings.put("fb", FB);
+        nsMappings.put("rdf", NamespaceEnum.rdf.getNamespace());
+        nsMappings.put("rdfs", NamespaceEnum.rdfs.getNamespace());
+        nsMappings.put("skos", NamespaceEnum.skos.getNamespace());
+    }
+    /** lines of the {@link #TEST_CONFIG}; read once by {@link #init()} */
+    private static List<String> configLines;
+    private RdfImportFilter importFilter;
+    
+    /** creates the prefix provider and reads the lines of the test configuration */
+    @BeforeClass
+    public static void init() throws IOException{
+        nsPrefixProvider = new NamespacePrefixProviderImpl(nsMappings);
+        InputStream in = PropertyPrefixFilterTest.class.getClassLoader().getResourceAsStream(TEST_CONFIG);
+        Assert.assertNotNull("Unable to read test config",in);
+        try {
+            configLines = (List<String>)IOUtils.readLines(in, "UTF-8");
+        } finally { //release the resource stream even if reading fails
+            IOUtils.closeQuietly(in);
+        }
+    }
+    
+    /** creates a new filter instance before every test */
+    @Before
+    public void createImportFilter(){
+        importFilter = new PropertyPrefixFilter(nsPrefixProvider, configLines);
+    }
+    
+    /** checks the accept/reject decision for properties matched by the configured rules */
+    @Test
+    public void testMappings(){
+        Node subject = Node.createURI("urn:subject");
+        Node value = Node.createURI("urn:value");
+        //"rdf:*" and "rdfs:*": properties of those namespaces are accepted
+        Node rdfType = Node.createURI(NamespaceEnum.rdf.getNamespace()+"type");
+        Assert.assertTrue(importFilter.accept(subject,rdfType,value));
+        Node rdfsLabel = Node.createURI(NamespaceEnum.rdfs.getNamespace()+"label");
+        Assert.assertTrue(importFilter.accept(subject,rdfsLabel,value));
+        //"!fb:type.object.guid" and "!fb:type.object.permission": explicitly excluded
+        Node guid = Node.createURI(FB+"type.object.guid");
+        Assert.assertFalse(importFilter.accept(subject,guid,value));
+        Node permission = Node.createURI(FB+"type.object.permission");
+        Assert.assertFalse(importFilter.accept(subject,permission,value));
+        //"fb:type.object": the other type.object properties are accepted
+        Node name = Node.createURI(FB+"type.object.name");
+        Assert.assertTrue(importFilter.accept(subject,name,value));
+        Node description = Node.createURI(FB+"type.object.description");
+        Assert.assertTrue(importFilter.accept(subject,description,value));
+        //"!fb:type": the remaining fb:type properties are rejected
+        Node dummy = Node.createURI(FB+"type.dummy");
+        Assert.assertFalse(importFilter.accept(subject,dummy,value));
+        Node typePlain = Node.createURI(FB+"type");
+        Assert.assertFalse(importFilter.accept(subject,typePlain,value));
+        //a property of a namespace that is not part of the configuration is rejected
+        Node other = Node.createURI(NamespaceEnum.cc.getNamespace()+"license");
+        Assert.assertFalse(importFilter.accept(subject,other,value));
+    }
+}

Propchange: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/java/org/apache/stanbol/entityhub/indexing/source/jenatdb/PropertyPrefixFilterTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config?rev=1467865&view=auto
==============================================================================
--- 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config
 (added)
+++ 
stanbol/trunk/entityhub/indexing/source/jenatdb/src/test/resources/prefix.config
 Mon Apr 15 06:47:13 2013
@@ -0,0 +1,11 @@
+
+# all from rdf and rdfs namespace
+rdf:*
+rdfs:*
+# exclude some specific type.object properties
+!fb:type.object.guid
+!fb:type.object.permission
+# import all type.object (other than excluded)
+fb:type.object
+# exclude all type properties (other than type.object)
+!fb:type


Reply via email to