ContentItemResource.java

rwesten Fri, 07 Oct 2011 13:21:41 -0700

Author: rwesten
Date: Fri Oct  7 20:21:13 2011
New Revision: 1180200

URL: http://svn.apache.org/viewvc?rev=1180200&view=rev
Log:
This fixes STANBOL-342 by replacing the SPARQL query with several 
graph.filter(..) calls.


On the test case with ~ 150 Text- and 100 Entityannotation the processing time 
was reduced from ~20sec to about 500ms

Modified:
    
incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java

Modified: 
incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java?rev=1180200&r1=1180199&r2=1180200&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java
 Fri Oct  7 20:21:13 2011
@@ -18,13 +18,23 @@ package org.apache.stanbol.enhancer.jers
 
 import static javax.ws.rs.core.MediaType.TEXT_HTML;
 import static org.apache.stanbol.commons.web.base.CorsHelper.addCORSOrigin;
+import static 
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.getReference;
+import static 
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.getReferences;
+import static 
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.getString;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_ORGANISATION;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PERSON;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.DBPEDIA_PLACE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.SKOS_CONCEPT;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.GEO_LAT;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.GEO_LONG;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
 
 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
@@ -33,6 +43,7 @@ import java.net.URI;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.Iterator;
 import java.util.List;
@@ -48,16 +59,15 @@ import javax.ws.rs.core.Response;
 import javax.ws.rs.core.UriInfo;
 import javax.ws.rs.core.Response.ResponseBuilder;
 
-import org.apache.clerezza.rdf.core.Graph;
 import org.apache.clerezza.rdf.core.Language;
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NonLiteral;
 import org.apache.clerezza.rdf.core.PlainLiteral;
 import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.TripleCollection;
-import org.apache.clerezza.rdf.core.TypedLiteral;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.access.TcManager;
 import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
@@ -65,14 +75,11 @@ import org.apache.clerezza.rdf.core.impl
 import org.apache.clerezza.rdf.core.serializedform.Serializer;
 import org.apache.clerezza.rdf.core.serializedform.SupportedFormat;
 import org.apache.clerezza.rdf.core.sparql.ParseException;
-import org.apache.clerezza.rdf.core.sparql.QueryParser;
-import org.apache.clerezza.rdf.core.sparql.ResultSet;
-import org.apache.clerezza.rdf.core.sparql.SolutionMapping;
-import org.apache.clerezza.rdf.core.sparql.query.SelectQuery;
-import org.apache.clerezza.rdf.utils.GraphNode;
+import org.apache.clerezza.rdf.ontologies.RDF;
 import org.apache.commons.io.IOUtils;
 import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource;
 import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -111,16 +118,15 @@ public class ContentItemResource extends
 
     protected String serializationFormat = SupportedFormat.RDF_XML;
 
-    protected Collection<EntityExtractionSummary> people;
 
-    protected Collection<EntityExtractionSummary> organizations;
-
-    protected Collection<EntityExtractionSummary> places;
-
-    protected Collection<EntityExtractionSummary> concepts;
+    /**
+     * Map holding the extraction mapped by {@link Properties#DC_TYPE} and the
+     * {@link Properties#ENHANCER_SELECTED_TEXT}.
+     * This map is initialised by {@link #initOccurrences()}.
+     */
+    protected Map<UriRef,Map<String,EntityExtractionSummary>> 
extractionsByTypeMap = 
+        new HashMap<UriRef,Map<String,EntityExtractionSummary>>();
     
-    protected Collection<EntityExtractionSummary> others;
-
     public ContentItemResource(String localId,
                                ContentItem ci,
                                UriInfo uriInfo,
@@ -155,6 +161,9 @@ public class ContentItemResource extends
         defaultThumbnails.put(DBPEDIA_PLACE, getStaticRootUrl() + 
"/home/images/compass_48.png");
         defaultThumbnails.put(SKOS_CONCEPT, getStaticRootUrl() + 
"/home/images/black_gear_48.png");
         defaultThumbnails.put(null, getStaticRootUrl() + 
"/home/images/unknown_48.png");
+        long start = System.currentTimeMillis();
+        initOccurrences();
+        log.info(" ... {}ms fro parsing Enhancement 
Reuslts",System.currentTimeMillis()-start);
     }
 
     public String getRdfMetadata(String mediatype) throws 
UnsupportedEncodingException {
@@ -191,95 +200,110 @@ public class ContentItemResource extends
         return metadataHref;
     }
 
-    public Collection<EntityExtractionSummary> getPersonOccurrences() throws 
ParseException {
-        if (people == null) {
-            people = getOccurrences(DBPEDIA_PERSON);
+    public Collection<EntityExtractionSummary> getOccurrences(UriRef type){
+        Map<String,EntityExtractionSummary> typeMap = 
extractionsByTypeMap.get(type);
+        Collection<EntityExtractionSummary> typeOccurrences;
+        if(typeMap != null){
+            typeOccurrences = typeMap.values();
+        } else {
+            typeOccurrences  = Collections.emptyList();
         }
-        return people;
+        return typeOccurrences;
+    }
+    
+    public Collection<EntityExtractionSummary> getPersonOccurrences() throws 
ParseException {
+        return getOccurrences(DBPEDIA_PERSON);
     }
     public Collection<EntityExtractionSummary> getOtherOccurrences() throws 
ParseException {
-        if(others == null){
-            others = getOccurrences(null);
-        }
-        return others;
+        return getOccurrences(null);
     }
 
     public Collection<EntityExtractionSummary> getOrganizationOccurrences() 
throws ParseException {
-        if (organizations == null) {
-            organizations = getOccurrences(DBPEDIA_ORGANISATION);
-        }
-        return organizations;
+        return getOccurrences(DBPEDIA_ORGANISATION);
     }
 
     public Collection<EntityExtractionSummary> getPlaceOccurrences() throws 
ParseException {
-        if (places == null) {
-            places = getOccurrences(DBPEDIA_PLACE);
-        }
-        return places;
+        return getOccurrences(DBPEDIA_PLACE);
     }
     public Collection<EntityExtractionSummary> getConceptOccurrences() throws 
ParseException {
-        if (concepts == null) {
-            concepts = getOccurrences(SKOS_CONCEPT);
-        }
-        return concepts;
+        return getOccurrences(SKOS_CONCEPT);
+    }
+    enum EAProps {
+        label,
+        entity,
+        confidence
     }
 
-    public Collection<EntityExtractionSummary> getOccurrences(UriRef type) 
throws ParseException {
+    private void initOccurrences() {
         MGraph graph = contentItem.getMetadata();
-        StringBuilder queryBuilder = new StringBuilder();
-        queryBuilder.append("PREFIX enhancer: 
<http://fise.iks-project.eu/ontology/> ");
-        queryBuilder.append("PREFIX dc:   <http://purl.org/dc/terms/> ");
-        queryBuilder.append("SELECT ?textAnnotation ?text ?entity 
?entity_label ?confidence WHERE { ");
-        queryBuilder.append("  ?textAnnotation a enhancer:TextAnnotation ." );
-        if(type != null){
-            queryBuilder.append("  ?textAnnotation dc:type 
").append(type).append(" . ");
-        } else {
-            //append a filter that this value needs to be non existent
-            queryBuilder.append(" OPTIONAL { ?textAnnotation dc:type ?type } . 
");
-            queryBuilder.append(" FILTER(!bound(?type)) ");
-        }
-        queryBuilder.append("  ?textAnnotation enhancer:selected-text ?text ." 
);
-        queryBuilder.append(" OPTIONAL {");
-        queryBuilder.append("   ?entityAnnotation dc:relation ?textAnnotation 
.");
-        queryBuilder.append("   ?entityAnnotation a enhancer:EntityAnnotation 
. ");
-        queryBuilder.append("   ?entityAnnotation enhancer:entity-reference 
?entity .");
-        queryBuilder.append("   ?entityAnnotation enhancer:entity-label 
?entity_label .");
-        queryBuilder.append("   ?entityAnnotation enhancer:confidence 
?confidence . }" );
-        queryBuilder.append("} ORDER BY ?text ");
-//        String queryString = String.format(queryBuilder.toString(), type);
-
-        SelectQuery query = (SelectQuery) 
QueryParser.getInstance().parse(queryBuilder.toString());
-        ResultSet result = tcManager.executeSparqlQuery(query, graph);
-        Map<String,EntityExtractionSummary> occurrenceMap = new 
TreeMap<String,EntityExtractionSummary>();
         LiteralFactory lf = LiteralFactory.getInstance();
-        while (result.hasNext()) {
-            SolutionMapping mapping = result.next();
-
-            UriRef textAnnotationUri = (UriRef) mapping.get("textAnnotation");
-            if (graph.filter(textAnnotationUri, Properties.DC_RELATION, 
null).hasNext()) {
+        Map<UriRef,Collection<NonLiteral>> suggestionMap = new 
HashMap<UriRef,Collection<NonLiteral>>();
+        // 1) get Entity Annotations
+        Map<NonLiteral,Map<EAProps,Object>> entitySuggestionMap = new 
HashMap<NonLiteral,Map<EAProps,Object>>();
+        Iterator<Triple> entityAnnotations = graph.filter(null, RDF.type, 
ENHANCER_ENTITYANNOTATION);
+        while(entityAnnotations.hasNext()){
+            NonLiteral entityAnnotation = 
entityAnnotations.next().getSubject();
+            //to avoid multiple lookups (e.g. if one entityAnnotation links to+
+            //several TextAnnotations) we cache the data in an intermediate Map
+            Map<EAProps,Object> eaData = new 
EnumMap<EAProps,Object>(EAProps.class);
+            eaData.put(EAProps.entity, getReference(graph, entityAnnotation, 
ENHANCER_ENTITY_REFERENCE));
+            eaData.put(EAProps.label, getString(graph, entityAnnotation, 
ENHANCER_ENTITY_LABEL));
+            eaData.put(EAProps.confidence, EnhancementEngineHelper.get(
+                graph, entityAnnotation, ENHANCER_CONFIDENCE, Double.class, 
lf));
+            entitySuggestionMap.put(entityAnnotation, eaData);
+            Iterator<UriRef> textAnnotations = getReferences(graph, 
entityAnnotation, DC_RELATION);
+            while(textAnnotations.hasNext()){
+                UriRef textAnnotation = textAnnotations.next();
+                Collection<NonLiteral> suggestions = 
suggestionMap.get(textAnnotation);
+                if(suggestions == null){
+                    suggestions = new ArrayList<NonLiteral>();
+                    suggestionMap.put(textAnnotation, suggestions);
+                }
+                suggestions.add(entityAnnotation);
+            }
+        }
+        // 2) get the TextAnnotations
+        Iterator<Triple> textAnnotations = graph.filter(null, RDF.type, 
ENHANCER_TEXTANNOTATION);
+        while(textAnnotations.hasNext()){
+            NonLiteral textAnnotation = textAnnotations.next().getSubject();
+            if (graph.filter(textAnnotation, DC_RELATION, null).hasNext()) {
                 // this is not the most specific occurrence of this name: skip
                 continue;
             }
-            // TODO: collect the selected text and contexts of subsumed
-            // annotations
-
-            Literal textLiteral = (Literal) mapping.get("text");
-            String text = textLiteral.getLexicalForm();
-
-            EntityExtractionSummary entity = occurrenceMap.get(text);
-            if (entity == null) {
-                entity = new EntityExtractionSummary(text, type, 
defaultThumbnails);
-                occurrenceMap.put(text, entity);
-            }
-            UriRef entityUri = (UriRef) mapping.get("entity");
-            if (entityUri != null) {
-                String label = ((Literal) 
mapping.get("entity_label")).getLexicalForm();
-                Double confidence = lf.createObject(Double.class, 
(TypedLiteral) mapping.get("confidence"));
-                Graph properties = new GraphNode(entityUri, 
contentItem.getMetadata()).getNodeContext();
-                entity.addSuggestion(entityUri, label, confidence, properties);
+            String text = getString(graph, textAnnotation, 
Properties.ENHANCER_SELECTED_TEXT);
+            if(text == null){
+                //ignore text annotations without text
+                continue;
+            }
+            Iterator<UriRef> types = getReferences(graph, textAnnotation, 
DC_TYPE);
+            if(!types.hasNext()){ //create an iterator over null in case no 
types are present
+                types = Collections.singleton((UriRef)null).iterator();
+            }
+            while(types.hasNext()){
+                UriRef type = types.next();
+                Map<String,EntityExtractionSummary> occurrenceMap = 
extractionsByTypeMap.get(type);
+                if(occurrenceMap == null){
+                    occurrenceMap = new 
TreeMap<String,EntityExtractionSummary>(String.CASE_INSENSITIVE_ORDER);
+                    extractionsByTypeMap.put(type, occurrenceMap);
+                }
+                EntityExtractionSummary entity = occurrenceMap.get(text);
+                if(entity == null){
+                    entity = new EntityExtractionSummary(text, type, 
defaultThumbnails);
+                    occurrenceMap.put(text, entity);
+                }
+                Collection<NonLiteral> suggestions = 
suggestionMap.get(textAnnotation);
+                if(suggestions != null){
+                    for(NonLiteral entityAnnotation : suggestions){
+                        Map<EAProps,Object> eaData = 
entitySuggestionMap.get(entityAnnotation);
+                        entity.addSuggestion(
+                            (UriRef)eaData.get(EAProps.entity),
+                            (String)eaData.get(EAProps.label), 
+                            (Double)eaData.get(EAProps.confidence), 
+                            graph);
+                    }
+                }
             }
         }
-        return occurrenceMap.values();
     }
 
     public static class EntityExtractionSummary implements 
Comparable<EntityExtractionSummary> {

svn commit: r1180200 - /incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java

Reply via email to