Author: rwesten
Date: Sat May 19 10:27:15 2012
New Revision: 1340409

URL: http://svn.apache.org/viewvc?rev=1340409&view=rev
Log:
STANBOL-583: Mainly adaptations of the CELI classification engine

* fise:TopicEnhancements are now created as defined by STANBOL-617. For that I 
needed to change the mappings from the CELI results (see the notes in the SOAP 
result processing part of the HttpClient) 
* Adapted the unit test to check those
* HttpClient adaptations similar to the other engines
* Improved error handling of the classification engine
* classification engine now uses a write lock while writing classification 
results


NOTE: I added extensive NOTE comments on the changes made to the classification 
engine. Many of those notes would apply similarly to all CELI engines.

Modified:
    
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
    
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
    
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
    
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
    
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java

Modified: 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
 (original)
+++ 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
 Sat May 19 10:27:15 2012
@@ -1,7 +1,13 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import static 
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.createTextEnhancement;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses.SKOS_CONCEPT;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE;
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE;
 
 import java.io.IOException;
 import java.net.URL;
@@ -13,11 +19,14 @@ import java.util.Map.Entry;
 import java.util.Set;
 import java.util.Vector;
 
+import javax.xml.soap.SOAPException;
+
 import org.apache.clerezza.rdf.core.Literal;
 import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
 import org.apache.clerezza.rdf.core.NoConvertorException;
 import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
 import org.apache.clerezza.rdf.core.impl.TripleImpl;
 import org.apache.felix.scr.annotations.Activate;
 import org.apache.felix.scr.annotations.Component;
@@ -37,6 +46,8 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
 import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses;
 import org.osgi.service.cm.ConfigurationException;
 import org.osgi.service.component.ComponentContext;
 import org.slf4j.Logger;
@@ -52,7 +63,8 @@ public class CeliClassificationEnhanceme
         * This ensures that no connections to external services are made if 
Stanbol is started in offline mode 
         * as the OnlineMode service will only be available if OfflineMode is 
deactivated. 
         */
-       @Reference
+       @SuppressWarnings("unused") //it's not unused!
+    @Reference
     private OnlineMode onlineMode; 
        
        private static List<String> supportedLangs = new Vector<String>();
@@ -66,6 +78,10 @@ public class CeliClassificationEnhanceme
                supportedLangs.add("pl");
                supportedLangs.add("nl");
        }
+       /**
+        * The literal factory used to create types literals
+        */
+    private LiteralFactory literalFactory = LiteralFactory.getInstance();
 
        /**
         * The literal representing the LangIDEngine as creator.
@@ -77,10 +93,16 @@ public class CeliClassificationEnhanceme
         * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
         */
        public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
-
+       /**
+        * Currently used as fise:entity-type for TopicAnnotations
+        */
+       private static final UriRef OWL_CLASS = new 
UriRef("http://www.w3.org/2002/07/owl#Class";);
+       
        private Logger log = LoggerFactory.getLogger(getClass());
 
-       private String language = null;
+       //NOTE: one CAN NOT store the language as member, as EnhancementEngines
+       //      can be called in parallel by multiple threads!
+       //private String language = null;
 
        /**
         * This contains the only MIME type directly supported by this 
enhancement
@@ -108,7 +130,8 @@ public class CeliClassificationEnhanceme
        @Activate
        protected void activate(ComponentContext ctx) throws IOException, 
ConfigurationException {
                super.activate(ctx);
-               Dictionary<String, Object> properties = ctx.getProperties();
+               @SuppressWarnings("unchecked")
+        Dictionary<String, Object> properties = ctx.getProperties();
                this.licenseKey = (String) properties.get(LICENSE_KEY);
                if (licenseKey == null || licenseKey.isEmpty()) {
                        log.warn("no CELI license key configured for this 
Engine, a guest account will be used (max 100 requests per day). Go on 
http://linguagrid.org for getting a proper license key.");
@@ -129,63 +152,114 @@ public class CeliClassificationEnhanceme
 
        @Override
        public int canEnhance(ContentItem ci) throws EngineException {
-               this.language = EnhancementEngineHelper.getLanguage(ci);
-               if (language == null) {
-                       throw new IllegalStateException("Unable to extract 
Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the 
canEnhance " + "method! -> This indicated an Bug in the implementation of the " 
+ "EnhancementJobManager!");
-               }
-
-               if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null 
&& this.isLangSupported(language))
+               String language = EnhancementEngineHelper.getLanguage(ci);
+               //canEnhance should inform if it can not enhance a ContentItem 
because
+               //of an potential error in the EnhancementChain configuration, 
but not
+               //throw runtime exceptions.
+//             if (language == null) {
+//                     throw new IllegalStateException("Unable to extract 
Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the 
canEnhance " + "method! -> This indicated an Bug in the implementation of the " 
+ "EnhancementJobManager!");
+//             }
+        if(language==null) {
+            log.warn("Unable to enhance ContentItem {} because language of the 
Content is unknown." +
+                    " Please check that a language identification engine is 
active in this EnhancementChain.",
+                    ci.getUri());
+        }
+
+               if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null 
&& this.isLangSupported(language)) {
+                   //NOTE: ENHANCE_ASYNC indicates that the 
computeEnhancements Method
+                   //      correctly applies read/write locks to the 
contentItem
                        return ENHANCE_ASYNC;
-               else
+               } else {
                        return CANNOT_ENHANCE;
+               }
        }
 
 
        @Override
        public void computeEnhancements(ContentItem ci) throws EngineException {
-               if (this.language == null)
-                       this.language = EnhancementEngineHelper.getLanguage(ci);
-
+           //NOTE: in the computeEnhancements Method on can check metadata 
already
+           //      checked within the canEnhance method. THis is not required, 
but it
+           //      may help to identify potential bugs in the 
EnhancementJobManager
+           //      implementation
+        String language = EnhancementEngineHelper.getLanguage(ci);
+        if (!isLangSupported(language)){
+            throw new IllegalStateException("Call to computeEnhancement with 
unsupported language '"
+                    +language+" for ContentItem "+ ci.getUri() +": This is 
also checked "
+                    + "in the canEnhance method! -> This indicated an Bug in 
the "
+                    + "implementation of the " + "EnhancementJobManager!");
+        }
                Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, 
SUPPORTED_MIMTYPES);
                if (contentPart == null) {
-                       throw new IllegalStateException("No ContentPart with 
Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + 
": This is also checked in the canEnhance method! -> This "
-                                       + "indicated an Bug in the 
implementation of the " + "EnhancementJobManager!");
+                       throw new IllegalStateException("No ContentPart with 
Mimetype '" 
+                               + TEXT_PLAIN_MIMETYPE + "' found for 
ContentItem " 
+                               + ci.getUri() + ": This is also checked in the 
canEnhance "
+                               + "method! -> This indicates an Bug in the 
implementation of "
+                               + "the EnhancementJobManager!");
                }
-               String text = "";
+               String text;
                try {
                        text = 
ContentItemHelper.getText(contentPart.getValue());
                } catch (IOException e) {
                        throw new InvalidContentException(this, ci, e);
                }
                if (text.trim().length() == 0) {
-                       log.info("No text contained in ContentPart 
{"+contentPart.getKey()+"} of ContentItem {"+ci.getUri()+"}");
+                       log.info("No text contained in ContentPart {} of 
ContentItem {}",
+                           contentPart.getKey(),ci.getUri());
                        return;
                }
-
+               //NOTE: EnhancementEngine implementations should pass all 
Exceptions 
+               //      (RuntimeExceptions as is and others wrapped as 
EngineExceptions). 
+               //      The EnhancementJobManager implementation has to catch 
and
+               //      process all those. Handling depends on the 
configuration of the
+               //      EnhancementChain (e.g. if this engine is optional 
enhancement of
+               //      the ContentItem will continue).
+               //      This is important as otherwise Users would get "200 ok" 
replies
+               //      for failed enhancement requests that have failed!
+               //
+               //      This means that:
+               //      * Http clients should pass on IOExceptions and 
SOAPExceptions
+               //      * No try/catch that also includes RuntimeExceptions
+               List<Concept> lista;
                try {
-                       
-                       List<Concept> lista = this.client.extractConcepts(text, 
language);
-                       LiteralFactory literalFactory = 
LiteralFactory.getInstance();
-
-                       MGraph g = ci.getMetadata();
-
-                       UriRef textAnnotation = 
EnhancementEngineHelper.createTextEnhancement(ci, this);
-
-                       for (Concept ne : lista) {
-                               List<UriRef> uris = 
this.getEntityRefForType(ne.getClassLabel());
-
-                               try {
-                                       for (UriRef uri : uris)
-                                               g.add(new 
TripleImpl(textAnnotation, DC_RELATION, uri));
-                                       g.add(new TripleImpl(textAnnotation, 
ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(ne.getConfidence())));
-                               } catch (NoConvertorException e) {
-                                       log.error(e.getMessage(),e);
-                               }
-                       }
-               } catch (Exception e) {
-                       log.error(e.getMessage(),e);
+                       lista = this.client.extractConcepts(text, language);
+        } catch (IOException e) { //re-throw exceptions as EngineException
+            throw new EngineException("Error while calling the CELI 
classification"
+                +" service (configured URL: " +serviceURL+")!",e);
+        } catch (SOAPException e) {
+            throw new EngineException("Error wile encoding/decoding the 
request/"
+                +"response to the CELI classification service!",e);
+        } 
+               if(lista.isEmpty()){ //not topics found
+                   return; //nothing to do
+               }
+               MGraph g = ci.getMetadata();
+               //NOTE: EnhancementEngines that use "ENHANCE_ASYNC" need to 
acquire a
+               //      writeLock before modifications to the enhancement 
metadata
+               ci.getLock().writeLock().lock();
+               try {
+               //see STANBOL-617 for rules how to encode extracted topics
+               //we need a single TextAnnotation to link all TopicAnnotations
+               UriRef textAnnotation = createTextEnhancement(ci, this);
+               // add the dc:type skos:Concept
+               g.add(new TripleImpl(textAnnotation, DC_TYPE, SKOS_CONCEPT));
+               
+               //not create the fise:TopicAnnotations
+               for (Concept ne : lista) {
+                   UriRef topicAnnotation = 
EnhancementEngineHelper.createTopicEnhancement(ci, this);
+               g.add(new TripleImpl(topicAnnotation, 
ENHANCER_ENTITY_REFERENCE, ne.getUri()));
+                g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_LABEL, 
+                    new PlainLiteralImpl(ne.getLabel())));
+                //TODO: currently I use owl:class as entity-type, because that 
is
+                //      what the linked dbpedia ontology resources are.
+                g.add(new TripleImpl(topicAnnotation, ENHANCER_ENTITY_TYPE, 
OWL_CLASS));
+                g.add(new TripleImpl(topicAnnotation, ENHANCER_CONFIDENCE, 
+                    literalFactory.createTypedLiteral(ne.getConfidence())));
+                //link to the TextAnnotation
+                g.add(new TripleImpl(topicAnnotation, DC_RELATION, 
textAnnotation));
+               }
+               } finally {
+                   ci.getLock().writeLock().unlock();
                }
-
        }
 
        private boolean isLangSupported(String language) {
@@ -195,14 +269,6 @@ public class CeliClassificationEnhanceme
                        return false;
        }
 
-       private List<UriRef> getEntityRefForType(String classificationLabels) {
-               List<UriRef> refs = new Vector<UriRef>();
-               String[] tmps = classificationLabels.split(" ");
-               for (String dbPediaLabel : tmps) {
-                       refs.add(new UriRef(NamespaceEnum.dbpedia_ont + 
dbPediaLabel));
-               }
-               return refs;
-       }
 
        @Override
        public Map<String, Object> getServiceProperties() {

Modified: 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
 (original)
+++ 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
 Sat May 19 10:27:15 2012
@@ -1,23 +1,35 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import java.io.BufferedWriter;
 import java.io.ByteArrayInputStream;
 import java.io.IOException;
+import java.io.InputStream;
 import java.io.OutputStreamWriter;
+import java.io.StringWriter;
+import java.io.Writer;
 import java.net.HttpURLConnection;
 import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
+import java.util.Map;
 import java.util.Vector;
 
 import javax.xml.soap.MessageFactory;
 import javax.xml.soap.SOAPBody;
+import javax.xml.soap.SOAPException;
 import javax.xml.soap.SOAPMessage;
 import javax.xml.soap.SOAPPart;
 import javax.xml.transform.stream.StreamSource;
 
+import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.util.Base64;
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.lang.StringEscapeUtils;
+import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 import org.w3c.dom.Element;
@@ -25,108 +37,258 @@ import org.w3c.dom.NodeList;
 
 public class ClassificationClientHTTP {
        
-       private final Logger log = LoggerFactory.getLogger(getClass());
-       
+       private final static Logger log = 
LoggerFactory.getLogger(ClassificationClientHTTP.class);
+       //NOTE: Defining charset, content-type and SOAP prefix/suffix as
+       //      constants does make more easy to configure those things
+    /**
+     * The UTF-8 {@link Charset}
+     */
+    private static final Charset UTF8 = Charset.forName("UTF-8");
+    /**
+     * The content type "text/xml; charset={@link #UTF8}"
+     */
+    private static final String CONTENT_TYPE = "text/xml; 
charset="+UTF8.name();
+    /**
+     * The XML version, encoding; SOAP envelope, heder and starting element of 
the body;
+     * processTextRequest and text starting element.
+     */
+    private static final String SOAP_PREFIX = "<?xml version=\"1.0\" 
encoding=\""+UTF8.name()+"\"?>" 
+            + "<soapenv:Envelope 
xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\"; "
+            + 
"xmlns:clas=\"http://linguagrid.org/v20110204/classification\";><soapenv:Header/><soapenv:Body>";
+    /**
+     * closes the text, processTextRequest, SOAP body and envelope
+     */
+    private static final String SOAP_SUFFIX = 
"</soapenv:Body></soapenv:Envelope>";
+    
+    //TODO: This should be configurable
        private static final int maxResultToReturn = 3;
        
-       private URL serviceEP;
-       private String licenseKey;
+       private final URL serviceEP;
+       private final String licenseKey;
+       
+       //NOTE: the request headers are the same for all request - so they can 
be
+       //      initialized in the constructor.
+       private final Map<String,String> requestHeaders;
        
        
        public ClassificationClientHTTP(URL serviceUrl, String licenseKey){
                this.serviceEP=serviceUrl;
                this.licenseKey=licenseKey;
+        Map<String,String> headers = new HashMap<String,String>();
+        headers.put("Content-Type", CONTENT_TYPE);
+        if(licenseKey != null){
+            String encoded = Base64.encode(this.licenseKey.getBytes(UTF8));
+            headers.put("Authorization", "Basic "+encoded);
+        }
+        this.requestHeaders = Collections.unmodifiableMap(headers);
        }
        
-       
-       public String doPostRequest(URL url, String body) throws IOException {
-               
-               HttpURLConnection urlConn = (HttpURLConnection) 
url.openConnection();
-               urlConn.setRequestMethod("POST");
-               urlConn.setDoInput(true);
-               if (null != body) {
-                       urlConn.setDoOutput(true);
-               } else {
-                       urlConn.setDoOutput(false);
-               }
-               urlConn.setUseCaches(false);
-               String  contentType = "text/xml; charset=utf-8";
-               urlConn.setRequestProperty("Content-Type", contentType);
-               if(this.licenseKey!=null){
-                       String encoded = 
Base64.encode(this.licenseKey.getBytes("UTF-8"));
-                       urlConn.setRequestProperty("Authorization", "Basic 
"+encoded);
-               }
-               
-               // send POST output
-               if (null != body) {
-                       OutputStreamWriter printout = new 
OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
-                       printout.write(body);
-                       printout.flush();
-                       printout.close();
-               }
-               
-               //close connection
-               urlConn.disconnect();
-               
-               // get response data
-               return IOUtils.toString(urlConn.getInputStream(), "UTF-8");
-       }
-
-
-       public List<Concept> extractConcepts(String text,String lang) {
-               List<Concept> extractedConcepts = new Vector<Concept>();
+       /*
+        * NOTE: parsing/returning a String requires to create in-memory copies
+        *       of the sent/received data. Imaging users that send the text of
+        *       100 pages PDF files to the Stanbol Enhancer.
+        *       Because of that an implementation that directly streams the
+        *       StringEscapeUtils.escapeXml(..) to the request is preferable 
+        *       
+        *       This will no longer allow to debug the data of the request and
+        *       response. See the commented main method at the end for 
alternatives
+        */
+//     public String doPostRequest(URL url, String body) throws IOException {
+//             
+//             HttpURLConnection urlConn = (HttpURLConnection) 
url.openConnection();
+//             urlConn.setRequestMethod("POST");
+//             urlConn.setDoInput(true);
+//             if (null != body) {
+//                     urlConn.setDoOutput(true);
+//             } else {
+//                     urlConn.setDoOutput(false);
+//             }
+//             urlConn.setUseCaches(false);
+//             String  contentType = "text/xml; charset=utf-8";
+//             urlConn.setRequestProperty("Content-Type", contentType);
+//             if(this.licenseKey!=null){
+//                     String encoded = 
Base64.encode(this.licenseKey.getBytes("UTF-8"));
+//                     urlConn.setRequestProperty("Authorization", "Basic 
"+encoded);
+//             }
+//             
+//             // send POST output
+//             if (null != body) {
+//                     OutputStreamWriter printout = new 
OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
+//                     printout.write(body);
+//                     printout.flush();
+//                     printout.close();
+//             }
+//             
+//             //close connection
+//             urlConn.disconnect();
+//             
+//             // get response data
+//             return IOUtils.toString(urlConn.getInputStream(), "UTF-8");
+//     }
+
+
+       //NOTE: forward IOException and SOAPExceptions to allow correct error 
handling
+       //      by the EnhancementJobManager.
+       //      Also RuntimeExceptions MUST NOT be cached out of the same 
reason!
+       public List<Concept> extractConcepts(String text,String lang) throws 
IOException, SOAPException {
+        if(text == null || text.isEmpty()){
+            //no text -> no classification
+            return Collections.emptyList();
+        }
+
+        //create the POST request
+        HttpURLConnection con = Utils.createPostRequest(serviceEP, 
requestHeaders);
+        //"stream" the request content directly to the buffered writer
+        BufferedWriter writer = new BufferedWriter(new 
OutputStreamWriter(con.getOutputStream(),UTF8));
+        writer.write(SOAP_PREFIX);
+        writer.write("<clas:classify>");
+        writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be 
configurable?
+        writer.write("<clas:model>");
+        writer.write(lang);
+        writer.write("</clas:model>");
+        writer.write("<clas:text>");
+        StringEscapeUtils.escapeXml(writer, text); //write the escaped text 
directly to the request
+        writer.write("</clas:text>");
+        writer.write("</clas:classify>");
+        writer.write(SOAP_SUFFIX);
+        writer.close();
+
+        //Call the service
+        long start = System.currentTimeMillis();
+        InputStream stream = con.getInputStream();
+        log.debug("Request to {} took 
{}ms",serviceEP,System.currentTimeMillis()-start);
+
+        //NOTE: forward IOException and SOAPExceptions to allow correct error 
handling
+        //      by the EnhancementJobManager.
+        //      Also RuntimeExceptions MUST NOT be cached out of the same 
reason!
+
+//             try {
+
+               // Create SoapMessage
+               MessageFactory msgFactory = MessageFactory.newInstance();
+               SOAPMessage message = msgFactory.createMessage();
+               SOAPPart soapPart = message.getSOAPPart();
+
+               // NOTE: directly use the InputStream provided by the 
URLConnection!
+//                     ByteArrayInputStream stream = new 
ByteArrayInputStream(responseXml.getBytes("UTF-8"));
+               StreamSource source = new StreamSource(stream);
+
+               // Set contents of message
+               soapPart.setContent(source);
+
+               SOAPBody soapBody = message.getSOAPBody();
+        List<Concept> extractedConcepts = new Vector<Concept>();
+               NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
+               HashSet<String> inserted=new HashSet<String>();
+               for (int i = 0; i < nlist.getLength() && i<maxResultToReturn; 
i++) {
+                   //NOTE: do not catch RuntimeExceptions. Error handling is 
done by
+                   //      the EnhancementJobManager!
+//                     try {
+                       Element result = (Element) nlist.item(i);
+
+                       //NOTE: (rwesten) implemented a mapping from the CELI 
classification
+                       //      to the Stanbol fise:TopicEnhancements 
(STANBOL-617) that
+                       //        * one fise:TopicAnnotation is generated per 
"model"
+                       //        * the whole label string is used as 
fise:entity-label
+                       //        * the uri of the most specific dbpedia 
ontology type (see
+                       //          selectClassificationClass) is used as 
fise:entity-reference
+                       //      This has the intuition that for users it is 
easier to grasp
+                       //      the meaning of the whole lable, while for 
machines the link
+                       //      to the most specific dbpedia ontology class is 
best suited.
+                       String model = 
result.getElementsByTagNameNS("*","label").item(0).getTextContent();
+                       model=model.substring(1, model.length()-1);
+                       UriRef modelConcept = selectClassificationClass(model);
+                       String 
conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
+                       Double confidence= new Double(conf);
+                       extractedConcepts.add(new 
Concept(model,modelConcept,confidence));
+//                     } catch (Exception e) {
+//                             e.printStackTrace();
+//                     }
 
-               try {
-                       String txt = StringEscapeUtils.escapeXml(text);
-                       String xmldata = "<soapenv:Envelope 
xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\"; 
xmlns:clas=\"http://linguagrid.org/v20110204/classification\";><soapenv:Header/><soapenv:Body>
 <clas:classify>"
-                                                       
+"<clas:user>wiki</clas:user><clas:model>"+lang+"</clas:model><clas:text>"+txt+"</clas:text></clas:classify></soapenv:Body></soapenv:Envelope>";
-
-                       
-                       String responseXml = doPostRequest(this.serviceEP, 
xmldata);
-                       log.debug(responseXml);
-
-                       // Create SoapMessage
-                       MessageFactory msgFactory = 
MessageFactory.newInstance();
-                       SOAPMessage message = msgFactory.createMessage();
-                       SOAPPart soapPart = message.getSOAPPart();
-
-                       // Load the SOAP text into a stream source
-                       ByteArrayInputStream stream = new 
ByteArrayInputStream(responseXml.getBytes("UTF-8"));
-                       StreamSource source = new StreamSource(stream);
-
-                       // Set contents of message
-                       soapPart.setContent(source);
-
-                       SOAPBody soapBody = message.getSOAPBody();
-                       NodeList nlist = 
soapBody.getElementsByTagNameNS("*","return");
-                       HashSet<String> inserted=new HashSet<String>();
-                       for (int i = 0; i < nlist.getLength() && 
i<maxResultToReturn; i++) {
-                               try {
-                                       Element result = (Element) 
nlist.item(i);
-
-                                       String model = 
result.getElementsByTagNameNS("*","label").item(0).getTextContent();
-                                       model=model.substring(1, 
model.length()-1);
-                                       String 
conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
-                                       float confidence=Float.parseFloat(conf);
-                                       
-                                       String[] tmps=model.split(" ");
-                                       
-                                       for(String t: tmps){
-                                               if(!inserted.contains(t)){
-                                                       
extractedConcepts.add(new Concept(t, confidence));
-                                                       inserted.add(t);
-                                               }
-                                       }
-                               } catch (Exception e) {
-                                       e.printStackTrace();
-                               }
-
-                       }
-               } catch (Exception e) {
-                       e.printStackTrace();
                }
-               
+//             } catch (Exception e) {
+//                     e.printStackTrace();
+//             }
                return extractedConcepts;
        }
-
+    /**
+     * Selects the dbpedia ontology class used for a classification result.<p>
+     * TopicClassifications require only a single fise:entity-reference.
+     * However the CELI classification service delivers <p>
+     * <code><pre>
+     *     &lt;ns2:label&gt;[Organisation HockeyTeam SportsTeam]&lt;/ns2:label&gt;
+     * </pre></code>
+     * because of that this method needs to select one of the labels.<p>
+     * An optional leading '[' and an optional trailing ']' are removed and the
+     * remaining text is split at single spaces. If this results in more than
+     * one token the 2nd token is selected, otherwise the only token is used.
+     * The selected token is appended to the dbpedia ontology namespace.<p>
+     * NOTE that the whole literal is used as fise:entity-label!<p>
+     * NOTE: the first and the last character of the parsed string are
+     * accessed without a length check, so an empty string is not supported.
+     * @param classificationLabels the label string as returned by the service
+     * (must not be <code>null</code> nor empty)
+     * @return the URI of the selected dbpedia ontology class
+     */
+    private UriRef selectClassificationClass(String classificationLabels) {
+        //NOTE: (rwesten) In general it would be better if CELI could provide
+        //      de-referenceable URLs for those suggestions.
+        //      If that is possible one would no longer need to link to the
+        //      most specific dbpedia ontology class for a category e.g.
+        //          http://dbpedia.org/ontology/HockeyTeam
+        //      for
+        //          [Organisation HockeyTeam SportsTeam]
+        //      but e.g.
+        //          http://linguagrid.org/category/HockeyTeam
+        //      meaning the linguagrid could provide categories as skos 
thesaurus
+        //      via its web interface
+        int start = classificationLabels.charAt(0) == '[' ? 1 : 0;
+        int end = classificationLabels.charAt(classificationLabels.length()-1) 
== ']' ?
+                classificationLabels.length() - 1 : 
classificationLabels.length();
+        String[] tmps = classificationLabels.substring(start, end).split(" ");
+        return new UriRef(NamespaceEnum.dbpedia_ont.getNamespace()+ //the 
namespace
+            (tmps.length > 1 ? tmps[1] : tmps[0])); //the Class for the label
+    }  
+       
+       //NOTE: If you stream the contents directly to the stream, you can no 
longer
+       //      debug the request/response. Because of that it is sometimes
+       //      helpful to have a main method for those tests
+       //      An even better variant would be to write a UnitTest for that!!
+       //      This would be recommended if the called service is still in beta
+       //      and may change at any time
+//    public static void main(String[] args) throws Exception {
+//        String lang = "fr";
+//        String text = "Brigitte Bardot, née  le 28 septembre " +
+//                "1934 à Paris, est une actrice de cinéma et chanteuse 
française.";
+//        
+//        //For request testing
+//        //Writer request = new StringWriter();
+//        
+//        //For response testing
+//        HttpURLConnection con = Utils.createPostRequest(
+//            new 
URL("http://linguagrid.org/LSGrid/ws/dbpedia-classification"),
+//            Collections.singletonMap("Content-Type", CONTENT_TYPE));
+//        Writer request = new OutputStreamWriter(con.getOutputStream(),UTF8);
+//        
+//        //"stream" the request content directly to the buffered writer
+//        BufferedWriter writer = new BufferedWriter(request);
+//        
+//        writer.write(SOAP_PREFIX);
+//        writer.write("<clas:classify>");
+//        writer.write("<clas:user>wiki</clas:user>");//TODO: should the user 
be configurable?
+//        writer.write("<clas:model>");
+//        writer.write(lang);
+//        writer.write("</clas:model>");
+//        writer.write("<clas:text>");
+//        StringEscapeUtils.escapeXml(writer, text); //write the escaped text 
directly to the request
+//        writer.write("</clas:text>");
+//        writer.write("</clas:classify>");
+//        writer.write(SOAP_SUFFIX);
+//        writer.close();
+//        
+//        //log the Request (if request testing)
+//        //log.info("Request \n{}",request.toString());
+//        
+//        //for response testing we need to call the service
+//        //Call the service
+//        long start = System.currentTimeMillis();
+//        InputStream stream = con.getInputStream();
+//        log.info("Request to took {}ms",System.currentTimeMillis()-start);
+//        log.info("Response:\n{}",IOUtils.toString(stream));
+//        stream.close();
+//    }
 }

Modified: 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
 (original)
+++ 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
 Sat May 19 10:27:15 2012
@@ -1,28 +1,34 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import org.apache.clerezza.rdf.core.UriRef;
+
 public class Concept {
        
-       private String classLabel;
-       private float confidence;
+       private final String label;
+       private final UriRef uri;
+       private final Double confidence;
        
-       public Concept(String classLabel, float confidence) {
+       public Concept(String label, UriRef uri,Double confidence) {
                super();
-               this.classLabel = classLabel;
+               this.label = label;
+               this.uri = uri;
                this.confidence = confidence;
        }
        
-       public String getClassLabel() {
-               return classLabel;
-       }
-       public void setClassLabel(String classLabel) {
-               this.classLabel = classLabel;
-       }
-       public float getConfidence() {
+
+       public Double getConfidence() {
                return confidence;
        }
-       public void setConfidence(float confidence) {
-               this.confidence = confidence;
-       }
+
+
+    public String getLabel() {
+        return label;
+    }
+
+
+    public UriRef getUri() {
+        return uri;
+    }
        
        
 }

Modified: 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
 (original)
+++ 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
 Sat May 19 10:27:15 2012
@@ -75,7 +75,6 @@ public class NERserviceClientHTTP {
                //no text -> no extractions
                return Collections.emptyList();
            }
-               List<NamedEntity> extractedNE = new Vector<NamedEntity>();
 
            //create the POST request
                HttpURLConnection con = Utils.createPostRequest(serviceEP, 
requestHeaders);
@@ -102,6 +101,9 @@ public class NERserviceClientHTTP {
                soapPart.setContent(source);
 
                SOAPBody soapBody = message.getSOAPBody();
+               
+               //extract the results
+        List<NamedEntity> extractedNE = new Vector<NamedEntity>();
                NodeList nlist = soapBody.getElementsByTagName("result");
                for (int i = 0; i < nlist.getLength(); i++) {
                        Element result = (Element) nlist.item(i);

Modified: 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java?rev=1340409&r1=1340408&r2=1340409&view=diff
==============================================================================
--- 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
 (original)
+++ 
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
 Sat May 19 10:27:15 2012
@@ -1,18 +1,25 @@
 package org.apache.stanbol.enhancer.engines.celi.classification.impl;
 
+import static junit.framework.Assert.assertEquals;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
 import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION;
 import static 
org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
+import static 
org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper.validateAllTopicAnnotations;
 import static org.junit.Assert.assertTrue;
 
 import java.io.IOException;
 import java.net.UnknownHostException;
 import java.util.Dictionary;
+import java.util.HashMap;
 import java.util.Hashtable;
 import java.util.Iterator;
 
+import junit.framework.Assert;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
 import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
 import org.apache.clerezza.rdf.core.Triple;
 import org.apache.clerezza.rdf.core.UriRef;
 import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
@@ -25,6 +32,8 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.stanbol.enhancer.servicesapi.EngineException;
 import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
 import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
+import org.apache.stanbol.enhancer.test.helper.EnhancementStructureHelper;
 import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
@@ -76,11 +85,15 @@ public class CeliClassificationEnhanceme
                        classificationEngine.computeEnhancements(ci);
 
                TestUtils.logEnhancements(ci);
-                       
-                       int textAnnoNum = 
checkAllTextAnnotations(ci.getMetadata(), TEXT);
-               log.info(textAnnoNum + " TextAnnotations found ...");
-               int entityAnnoNum = checkAllEntityAnnotations(ci.getMetadata());
-               log.info(entityAnnoNum + " EntityAnnotations found ...");
+                HashMap<UriRef,Resource> expectedValues = new 
HashMap<UriRef,Resource>();
+                   expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM, 
ci.getUri());
+                   expectedValues.put(Properties.DC_CREATOR, 
LiteralFactory.getInstance().createTypedLiteral(
+                       classificationEngine.getClass().getName()));
+
+                       int textAnnoNum = 
EnhancementStructureHelper.validateAllTextAnnotations(ci.getMetadata(), 
TEXT,expectedValues);
+                       assertEquals("Only a single fise:TextAnnotation is 
expeted", 1, textAnnoNum);
+                       int numTopicAnnotations = 
validateAllTopicAnnotations(ci.getMetadata()  , expectedValues);
+                       assertTrue("No TpocisAnnotations found", 
numTopicAnnotations > 0);
                } catch (EngineException e) {
                        if (e.getCause() != null && e.getCause() instanceof 
UnknownHostException) {
                                log.warn("Celi Service not reachable -> 
offline? -> deactivate test");
@@ -90,25 +103,4 @@ public class CeliClassificationEnhanceme
                }
        }
 
-       private int checkAllEntityAnnotations(MGraph g) {
-               Iterator<Triple> entityAnnotationIterator = g.filter(null, 
RDF_TYPE, ENHANCER_ENTITYANNOTATION);
-               int entityAnnotationCount = 0;
-               while (entityAnnotationIterator.hasNext()) {
-                       UriRef entityAnnotation = (UriRef) 
entityAnnotationIterator.next().getSubject();
-                       entityAnnotationCount++;
-               }
-               return entityAnnotationCount;
-       }
-       
-       private int checkAllTextAnnotations(MGraph g, String content) {
-               Iterator<Triple> textAnnotationIterator = g.filter(null, 
RDF_TYPE, ENHANCER_TEXTANNOTATION);
-               // test if a textAnnotation is present
-               assertTrue(textAnnotationIterator.hasNext());
-               int textAnnotationCount = 0;
-               while (textAnnotationIterator.hasNext()) {
-                       UriRef textAnnotation = (UriRef) 
textAnnotationIterator.next().getSubject();
-                       textAnnotationCount++;
-               }
-               return textAnnotationCount;
-       }
 }


Reply via email to