Author: rwesten
Date: Mon Jan 20 07:57:35 2014
New Revision: 1559637

URL: http://svn.apache.org/r1559637
Log:
implementation for STANBOL-1259 for the 0.12 branch. This also adds the new 
features as configuration options to the EntityhubDereferenceEngine

Modified:
    
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
    
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
    
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
    
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
    
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties

Modified: 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceConstants.java
 Mon Jan 20 07:57:35 2014
@@ -82,4 +82,32 @@ public interface DereferenceConstants {
      */
     String DEREFERENCE_ENTITIES_LDPATH = "enhancer.engines.dereference.ldpath";
 
+    /**
+     * Configuration property for the URI prefixes entity URIs are checked
+     * against. Only entities with a URI matching any of the configured
+     * prefixes or any configured {@link #URI_PATTERN} will be dereferenced.
+     * If neither prefixes nor patterns are configured all entities will be
+     * dereferenced.
+     * This has lower priority than {@link #FALLBACK_MODE}.
+     * @see #FALLBACK_MODE
+     */
+    String URI_PREFIX = "enhancer.engines.dereference.uriPrefix";
+    
+    
+    /**
+     * Configuration property for the regex patterns applied to entity URIs.
+     * Only entities with a URI matching any of the configured
+     * {@link #URI_PREFIX}es or any configured pattern will be dereferenced.
+     * If neither prefixes nor patterns are configured all entities will be
+     * dereferenced.
+     * This has lower priority than {@link #FALLBACK_MODE}.
+     * @see #FALLBACK_MODE
+     */
+    String URI_PATTERN = "enhancer.engines.dereference.uriPattern";
+    
+    /**
+     * Configuration property for the fallback mode. If fallback mode is
+     * activated a dereference engine will not try to dereference entities
+     * for which triples are already present in the enhancement results.
+     */
+    String FALLBACK_MODE = "enhancer.engines.dereference.fallback";
+    
+    /**
+     * The default state of the {@link #FALLBACK_MODE}: enabled.
+     */
+    boolean DEFAULT_FALLBACK_MODE = true;
+    
 }

Modified: 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/DereferenceEngineConfig.java
 Mon Jan 20 07:57:35 2014
@@ -213,4 +213,60 @@ public class DereferenceEngineConfig imp
         return config;
     }
     
+    /**
+     * Reports whether the {@link DereferenceConstants#FALLBACK_MODE} is
+     * enabled for this configuration.
+     * @return the configured state or
+     * {@link DereferenceConstants#DEFAULT_FALLBACK_MODE} if no value is set
+     */
+    public boolean isFallbackMode(){
+        final Object configured = config.get(FALLBACK_MODE);
+        if(configured != null){
+            return Boolean.parseBoolean(configured.toString());
+        }
+        //nothing configured: use the default
+        return DereferenceConstants.DEFAULT_FALLBACK_MODE;
+    }
+    
+    /**
+     * Getter for the values configured for the
+     * {@link DereferenceConstants#URI_PATTERN} property.
+     * @return the URI patterns in the configured order. An empty List if none
+     */
+    public List<String> getUriPatterns(){
+        return getStrValues(config.get(DereferenceConstants.URI_PATTERN));
+    }
+    /**
+     * Getter for the values configured for the
+     * {@link DereferenceConstants#URI_PREFIX} property.
+     * @return the URI prefixes in the configured order. An empty List if none
+     */
+    public List<String> getUriPrefixes(){
+        return getStrValues(config.get(DereferenceConstants.URI_PREFIX));
+    }
+    /**
+     * Extracts the non-blank String values from the parsed configuration value.
+     * @param value the value: a String, a String[] or a Collection (elements
+     * are converted by calling {@link Object#toString()}). Any other type,
+     * including <code>null</code>, results in an empty List
+     * @return the non-blank values as List in the parsed order. Never
+     * <code>null</code>
+     */
+    private List<String> getStrValues(Object value) {
+        final List<String> values;
+        if(value instanceof String){
+            values = StringUtils.isBlank((String)value) ?
+                    Collections.<String>emptyList() :
+                    Collections.singletonList((String)value);
+        } else if(value instanceof String[]){
+            values = new ArrayList<String>();
+            for(String pattern : (String[])value){
+                if(!StringUtils.isBlank(pattern)){
+                    values.add(pattern);
+                }
+            }
+        } else if(value instanceof Collection<?>){
+            values = new ArrayList<String>();
+            for(Object pattern : (Collection<?>)value){
+                //keep non-blank elements only (same rule as for String[]).
+                //The missing negation caused all real values to be dropped.
+                if(pattern != null && !StringUtils.isBlank(pattern.toString())){
+                    values.add(pattern.toString());
+                }
+            }
+        } else {
+            values = Collections.emptyList();
+        }
+        return values;
+    }
 }

Modified: 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/dereference/core/src/main/java/org/apache/stanbol/enhancer/engines/dereference/EntityDereferenceEngine.java
 Mon Jan 20 07:57:35 2014
@@ -16,6 +16,7 @@
  */
 package org.apache.stanbol.enhancer.engines.dereference;
 
+import static 
org.apache.stanbol.enhancer.engines.dereference.DereferenceConstants.URI_PATTERN;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
 
@@ -32,6 +33,9 @@ import java.util.concurrent.ExecutionExc
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Future;
 import java.util.concurrent.locks.Lock;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+import java.util.regex.PatternSyntaxException;
 
 import org.apache.clerezza.rdf.core.Language;
 import org.apache.clerezza.rdf.core.MGraph;
@@ -74,6 +78,13 @@ public class EntityDereferenceEngine imp
     
     protected final boolean filterAcceptLanguages;
     
+    protected final boolean uriFilterPresent;
+    
+    protected final List<String> prefixList;
+    
+    protected final List<Pattern> patternList;
+    
+    protected final boolean fallbackMode;
     /**
      * The Map holding the {@link #serviceProperties} for this engine.
      */
@@ -90,14 +101,53 @@ public class EntityDereferenceEngine imp
         }
         this.config = config;
         this.name = config.getEngineName();
+        log.debug("create {} name {}", getClass().getSimpleName(), name);
         this.filterContentLanguages = config.isFilterContentLanguages();
+        log.debug(" - filter content languages: {}", filterContentLanguages);
         this.filterAcceptLanguages = config.isFilterAcceptLanguages();
+        log.debug(" - filter Accept languages: {}", filterAcceptLanguages);
         if(dereferencer == null){
             throw new IllegalArgumentException("The parsed EntityDereferencer 
MUST NOT be NULL!");
         }
         this.dereferencer = dereferencer;
-        //init the defautl ordering
-        setEngineOrdering(DEFAULT_ENGINE_ORDERING);
+        log.debug(" - dereferenced {} (type: {})", dereferencer, 
dereferencer.getClass().getName());
+        //init the default ordering
+        this.fallbackMode = config.isFallbackMode();
+        log.debug(" - fallback Mode: {}", fallbackMode);
+        //Set the default engine ordering based on the fallback mode state:
+        //in case of fallback mode call this after dereferencing engines 
+        //without fallback mode
+        setEngineOrdering(fallbackMode ? DEFAULT_ENGINE_ORDERING - 1 : 
+               DEFAULT_ENGINE_ORDERING);
+        log.debug(" - engine order: {}", getEngineOrdering());
+        //sort the prefixes
+        prefixList = config.getUriPrefixes();
+        if(prefixList.size() > 1){
+               Collections.sort(prefixList);
+        }
+        if(log.isDebugEnabled()){
+               log.debug(" - configured prefixes:");
+               for(String prefix : prefixList){
+                       log.debug("     {}",prefix);
+               }
+        }
+        //compile the patterns
+        patternList = new ArrayList<Pattern>();
+        for(String pattern : config.getUriPatterns()){
+               try {
+                       patternList.add(Pattern.compile(pattern));
+               } catch (PatternSyntaxException e){
+                       throw new IllegalStateException("Unable to compile URI 
pattern '"
+                                       + pattern + "' pared via property '" + 
URI_PATTERN + "'!");
+               }
+        }
+        if(log.isDebugEnabled()){
+               log.debug(" - configured patterns:");
+               for(Pattern pattern : patternList){
+                       log.debug("     {}",pattern);
+               }
+        }
+        uriFilterPresent = !prefixList.isEmpty() || !patternList.isEmpty();
     }
     
     /**
@@ -157,6 +207,7 @@ public class EntityDereferenceEngine imp
             return;
         }
         log.debug("> dereference Entities for ContentItem {}", ci.getUri());
+        long start = System.nanoTime();
         final DereferenceContext derefContext = new 
DereferenceContext(offline);
         Set<String> includedLangs = new HashSet<String>();
         //TODO: parse accept languages as soon as Enhancement properties are 
implemented
@@ -172,27 +223,27 @@ public class EntityDereferenceEngine imp
                 }
             } //no content language filtering - leave contentLanguages empty
             //parse the referenced entities from the graph
+            Set<UriRef> checked = new HashSet<UriRef>();
             Iterator<Triple> entityReferences = metadata.filter(null, 
ENHANCER_ENTITY_REFERENCE, null);
             while(entityReferences.hasNext()){
                 Triple triple = entityReferences.next();
                 Resource entityReference = triple.getObject();
-                if(entityReference instanceof UriRef){
+                if((entityReference instanceof UriRef) && //only URIs
+                               checked.add((UriRef)entityReference) && //do 
not check a URI twice
+                               chekcFallbackMode((UriRef)entityReference, 
metadata) && //fallback mode
+                               checkURI((UriRef)entityReference)){ //URI 
prefixes and patterns
                     boolean added = 
referencedEntities.add((UriRef)entityReference);
                     if(added && log.isTraceEnabled()){
                         log.trace("  ... schedule Entity {}", entityReference);
                     }
-                } else if(log.isWarnEnabled()){
-                    //log enhancement that use a fise:entiy-reference with a 
non UriRef value!
-                    NonLiteral enhancement = triple.getSubject();
-                    log.warn("Can not dereference invalid Enhancement 
{}",enhancement);
-                    for(Iterator<Triple> it = metadata.filter(enhancement, 
null, null);it.hasNext();){
-                        log.warn("   {}", it.next());
-                    }
+                } else if(log.isTraceEnabled()){
+                    log.trace(" ... ignore Entity {}",entityReferences);
                 }
             }
         } finally {
             ci.getLock().readLock().unlock();
         }
+        long schedule = System.nanoTime();
         if(!includedLangs.isEmpty()){
             includedLangs.add(null); //also include literals without language
             //and set the list to the dereference context
@@ -204,7 +255,6 @@ public class EntityDereferenceEngine imp
             referencedEntities.size());
         //(2) dereference the Entities
         ExecutorService executor = dereferencer.getExecutor();
-        long start = System.currentTimeMillis();
         Set<UriRef> failedEntities = new HashSet<UriRef>();
         int dereferencedCount = 0;
         List<DereferenceJob> dereferenceJobs = new ArrayList<DereferenceJob>(
@@ -256,25 +306,99 @@ public class EntityDereferenceEngine imp
                 }
             }
         }
-        long duration = System.currentTimeMillis() - start;
+        long end = System.nanoTime();
+        float sheduleDuration = ((schedule - start)/10000)/100f;
+        float dereferenceDuration = ((end - schedule)/10000)/100f;
+        float duration = ((end - start)/10000)/100f;
         if(!failedEntities.isEmpty()){
             log.warn(" - unable to dereference {} of {} for ContentItem {}",
                 new Object[] {failedEntities.size(),referencedEntities.size(), 
                     ci.getUri()});
         }
         if(log.isDebugEnabled() && dereferencedCount > 0){
-            log.debug(" - dereferenced {} of {} Entities in {}ms 
({}ms/dereferenced)", 
-                new Object[]{dereferencedCount, referencedEntities.size(),
-                    duration, (duration*100/dereferencedCount)/100.0f});
+            log.debug(" - dereferenced {} of {} Entities in {}ms | 
schedule:{}ms | "
+                       + " dereference: {}ms ({}ms/entity)", new Object[]{
+                                       dereferencedCount, 
referencedEntities.size(),
+                                       duration, sheduleDuration, 
dereferenceDuration,
+                                       dereferenceDuration/dereferencedCount});
         }
         
     }
 
-    @Override
+       @Override
     public String getName() {
         return name;
     }
 
+    /**
+     * Checks the parsed entity against the fallback mode. If fallback mode
+     * is inactive every entity is accepted. If it is active only entities
+     * that are not yet the subject of any triple in the parsed metadata are
+     * accepted.
+     * @param entityReference the referenced entity
+     * @param metadata the metadata checked for triples about the entity
+     * @return <code>true</code> if the entity should be processed.
+     * <code>false</code> if fallback mode is active and the metadata already
+     * contain a triple with the entity as subject
+     */
+    protected boolean chekcFallbackMode(UriRef entityReference, MGraph metadata) {
+        if(!fallbackMode){
+            return true; //no fallback mode: process all entities
+        }
+        //fallback mode: filter entities for which an outgoing relation is present
+        boolean hasData = metadata.filter(entityReference, null, null).hasNext();
+        return !hasData;
+    }
+    /**
+     * Checks if we need to schedule an Entity based on its URI. This uses
+     * configured URI prefixes and URI patterns. Prefixes are checked first
+     * as this is faster than evaluating the regex patterns.
+     * @param entity the entity to check
+     * @return <code>true</code> if this entity should be scheduled for
+     * dereferencing (no filter is configured, or the URI starts with a
+     * configured prefix, or a configured pattern is found in the URI).
+     * <code>false</code> if not.
+     */
+    protected boolean checkURI(UriRef entity){
+        if(!uriFilterPresent){ //if no prefix nor pattern is set
+            return true; //accept all
+        }
+        String entityUri = entity.getUnicodeString();
+        log.trace(" - checkURI {}", entityUri);
+        //(1) check against prefixes
+        if(!prefixList.isEmpty()){
+            //as we do not want to check with all configured prefixes let us
+            //do a binary search (the prefixList is sorted)
+            int pos = Collections.binarySearch(prefixList, entityUri);
+            if(pos >= 0){
+                return true; //entityUri found in list
+            }
+            /*
+             * binarySearch returned -(insertion point) - 1. Every prefix of
+             * the entityUri sorts before the entityUri, so candidates are at
+             * the positions below the insertion point.
+             *
+             * Example:
+             * ["a","b"] <- "bc"
+             * binary search returns -3 (insertion point 2) -> start at pos 1
+             *
+             * Example2:
+             * ["b","c"] <- "a"
+             * binary search returns -1 (insertion point 0) -> start at pos -1
+             * meaning there is no candidate (the loop is not entered)
+             *
+             * Example3 (nested prefixes):
+             * ["a","ab"] <- "ac"
+             * pos 1 ("ab") does not match, but pos 0 ("a") does. Therefore
+             * we need to walk back as long as a match is still possible.
+             */
+            for(int prefixPos = -pos - 2; prefixPos >= 0; prefixPos--){
+                String prefix = prefixList.get(prefixPos);
+                if(entityUri.startsWith(prefix)){
+                    log.trace(" ... matched prefix {}", prefix);
+                    return true; //it matches a prefix in the list
+                }
+                log.trace("  ... no match for prefix {}", prefix);
+                //In a sorted list all entries between a matching prefix and
+                //the insertion point start with that prefix. So if this
+                //entry does not even share the first char with the URI no
+                //earlier entry can match (both are non-empty here as
+                //startsWith would have matched an empty prefix)
+                if(prefix.charAt(0) != entityUri.charAt(0)){
+                    break;
+                }
+            }
+        }
+        //(2) check against regex
+        for(Pattern pattern : patternList){
+            Matcher m = pattern.matcher(entityUri);
+            if(m.find()){
+                if(log.isTraceEnabled()) {
+                    log.trace("  ... matches pattern {}", pattern);
+                }
+                return true;
+            } else if(log.isTraceEnabled()){ //try the next pattern
+                log.trace("  ... no match for pattern {}", pattern);
+            }
+        }
+        return false; //no match
+    }
+    
+    
     /**
      * Used both as {@link Callable} submitted to the {@link ExecutorService}
      * and as object to {@link #await()} the completion of the task.

Modified: 
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/java/org/apache/stanbol/enhancer/engines/dereference/entityhub/EntityhubDereferenceEngine.java
 Mon Jan 20 07:57:35 2014
@@ -63,8 +63,12 @@ import org.slf4j.LoggerFactory;
 @org.apache.felix.scr.annotations.Properties(value={
     @Property(name=PROPERTY_NAME),
     @Property(name=EntityhubDereferenceEngine.SITE_ID),
+    @Property(name=DereferenceConstants.FALLBACK_MODE, 
+       boolValue=DereferenceConstants.DEFAULT_FALLBACK_MODE),
+    @Property(name=DereferenceConstants.URI_PREFIX, 
cardinality=Integer.MAX_VALUE),
+    @Property(name=DereferenceConstants.URI_PATTERN, 
cardinality=Integer.MAX_VALUE),
     @Property(name=DereferenceConstants.FILTER_CONTENT_LANGUAGES, 
-    boolValue=DereferenceConstants.DEFAULT_FILTER_CONTENT_LANGUAGES),
+       boolValue=DereferenceConstants.DEFAULT_FILTER_CONTENT_LANGUAGES),
     @Property(name=DEREFERENCE_ENTITIES_FIELDS,cardinality=Integer.MAX_VALUE,
        
value={"rdfs:comment","geo:lat","geo:long","foaf:depiction","dbp-ont:thumbnail"}),
     @Property(name=DEREFERENCE_ENTITIES_LDPATH, cardinality=Integer.MAX_VALUE),

Modified: 
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: 
http://svn.apache.org/viewvc/stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1559637&r1=1559636&r2=1559637&view=diff
==============================================================================
--- 
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties
 (original)
+++ 
stanbol/branches/release-0.12/enhancement-engines/dereference/entityhub/src/main/resources/OSGI-INF/metatype/metatype.properties
 Mon Jan 20 07:57:35 2014
@@ -53,3 +53,23 @@ enhancer.engine.dereference.filterConten
 enhancer.engine.dereference.filterContentlanguages.description=If enabled only 
Literals \
 in the language detected for the parsed Content (or with no defined language) 
are dereferenced
 
+
+enhancer.engines.dereference.fallback.name=Fallback Mode
+enhancer.engines.dereference.fallback.description=If enabled the engine will only \
+try to dereference Entities for which no data were yet added to the Enhancement Results
+
+enhancer.engines.dereference.uriPrefix.name= URI Prefix
+enhancer.engines.dereference.uriPrefix.description=URI prefixes such as \
+'http://rdf.freebase.com/ns/' this engine will try to resolve. NOTE that \
+this engine will use both the configured URI prefixes AND URI patterns to check \
+if it can dereference an Entity. If any of those configurations matches the \
+Entity will be dereferenced.
+
+enhancer.engines.dereference.uriPattern.name=URI Pattern
+enhancer.engines.dereference.uriPattern.description=Regex patterns matched \
+against entity URIs (e.g. '^http://(\\w+\\.)?dbpedia\\.org/resource/.*' would match \
+dbpedia.org Resources regardless of the language). NOTE that \
+this engine will use both the configured URI prefixes AND URI patterns to check \
+if it can dereference an Entity. If any of those configurations matches the \
+Entity will be dereferenced.
+


Reply via email to