Modified: incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original) +++ incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Thu Feb 2 13:52:27 2012 @@ -23,7 +23,6 @@ import static org.apache.stanbol.enhance import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTION_CONTEXT; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; -import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT; import java.io.IOException; import java.nio.charset.Charset; @@ -35,6 +34,7 @@ import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import opennlp.tools.namefind.NameFinderME; @@ -56,10 +56,12 @@ import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.stanbol.commons.opennlp.OpenNLP; import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider; +import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; @@ -71,6 +73,11 @@ import org.slf4j.LoggerFactory; */ public class NEREngineCore implements EnhancementEngine { protected static final String TEXT_PLAIN_MIMETYPE = "text/plain"; + /** + * Contains the only supported mimetype {@link #TEXT_PLAIN_MIMETYPE} + */ + protected static final Set<String> SUPPORTED_MIMETYPES = + Collections.singleton(TEXT_PLAIN_MIMETYPE); private final Logger log = LoggerFactory.getLogger(getClass()); private static Map<String,UriRef> entityTypes = new HashMap<String,UriRef>(); @@ -119,43 +126,44 @@ public class NEREngineCore implements En //first check the langauge before processing the content (text) String language = extractLanguage(ci); if(language == null){ - log.warn("Unable to extract Language for ContentItem {}: The text" + - "of this ContentItem will not be processed by the NER engine!", - ci.getUri()); - return; + throw new IllegalStateException("Unable to extract Language for " + + "ContentItem "+ci.getUri()+": This is also checked in the canEnhance " + + "method! 
-> This indicated an Bug in the implementation of the " + + "EnhancementJobManager!"); } if(!isProcessedLangage(language)){ - log.warn("The language {} of ContentItem {} is not configured to be" + - "processed by this NER engine instance (processed {})!", - new Object[]{language,ci.getUri(),processedLangs}); - return; + throw new IllegalStateException("The language '"+language+"' of ContentItem "+ci.getUri() + + " is not configured to be processed by this NER engine instance " + + "(processed "+processedLangs+"): This is also checked in the canEnhance " + + "method! -> This indicated an Bug in the implementation of the " + + "EnhancementJobManager!"); + } + Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); + if(contentPart == null){ + throw new IllegalStateException("No ContentPart with Mimetype '" + + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri() + + ": This is also checked in the canEnhance method! -> This " + + "indicated an Bug in the implementation of the " + + "EnhancementJobManager!"); } - String mimeType = ci.getMimeType().split(";", 2)[0]; String text; - if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) { - try { - text = IOUtils.toString(ci.getStream(),"UTF-8"); - } catch (IOException e) { - throw new InvalidContentException(this, ci, e); - } - } else { - //TODO: change that as soon the Adapter Pattern is used for multiple - // mimetype support. - StringBuilder textBuilder = new StringBuilder(); - Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null); - while (it.hasNext()) { - textBuilder.append(it.next().getObject()); - } - text = textBuilder.toString(); + try { + text = ContentItemHelper.getText(contentPart.getValue()); + } catch (IOException e) { + throw new InvalidContentException(this, ci, e); } if (text.trim().length() == 0) { // TODO: make the length of the data a field of the ContentItem // interface to be able to filter out empty items in the canEnhance // method - log.warn("nothing to extract knowledge from in ContentItem {}", ci); + log.warn("ContentPart {} of ContentItem {} does not contain any text" + + "to extract knowledge from in ContentItem {}", + contentPart.getKey(),ci); return; } - log.debug("computeEnhancements {} text={}", ci.getUri().getUnicodeString(), StringUtils.abbreviate(text, 100)); + log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}", + new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(), + StringUtils.abbreviate(text, 100)}); try { for (Map.Entry<String,UriRef> type : entityTypes.entrySet()) { String typeLabel = type.getKey(); @@ -190,57 +198,62 @@ public class NEREngineCore implements En LiteralFactory literalFactory = LiteralFactory.getInstance(); MGraph g = ci.getMetadata(); Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text); - - Map<String,UriRef> previousAnnotations = new LinkedHashMap<String,UriRef>(); - for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) { - - String name = nameInContext.getKey(); - List<NameOccurrence> occurrences = nameInContext.getValue(); - - UriRef firstOccurrenceAnnotation = null; - - for (NameOccurrence occurrence : occurrences) { - UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this); - g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory - .createTypedLiteral(name))); - g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, literalFactory - 
.createTypedLiteral(occurrence.context))); - g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri)); - g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory - .createTypedLiteral(occurrence.confidence))); - if (occurrence.start != null && occurrence.end != null) { - g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory - .createTypedLiteral(occurrence.start))); - g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory - .createTypedLiteral(occurrence.end))); - } - - // add the subsumption relationship among occurrences of the same - // name - if (firstOccurrenceAnnotation == null) { - // check already extracted annotations to find a first most - // specific occurrence - for (Map.Entry<String,UriRef> entry : previousAnnotations.entrySet()) { - if (entry.getKey().contains(name)) { - // we have found a most specific previous - // occurrence, use it as subsumption target - firstOccurrenceAnnotation = entry.getValue(); - g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation)); - break; - } + //lock the ContentItem while writing the RDF data for found Named Entities + ci.getLock().writeLock().lock(); + try { + Map<String,UriRef> previousAnnotations = new LinkedHashMap<String,UriRef>(); + for (Map.Entry<String,List<NameOccurrence>> nameInContext : entityNames.entrySet()) { + + String name = nameInContext.getKey(); + List<NameOccurrence> occurrences = nameInContext.getValue(); + + UriRef firstOccurrenceAnnotation = null; + + for (NameOccurrence occurrence : occurrences) { + UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this); + g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory + .createTypedLiteral(name))); + g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, literalFactory + .createTypedLiteral(occurrence.context))); + g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri)); + g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory + .createTypedLiteral(occurrence.confidence))); + if (occurrence.start != null && occurrence.end != null) { + g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory + .createTypedLiteral(occurrence.start))); + g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory + .createTypedLiteral(occurrence.end))); } + + // add the subsumption relationship among occurrences of the same + // name if (firstOccurrenceAnnotation == null) { - // no most specific previous occurrence, I am the first, - // most specific occurrence to be later used as a target - firstOccurrenceAnnotation = textAnnotation; - previousAnnotations.put(name, textAnnotation); + // check already extracted annotations to find a first most + // specific occurrence + for (Map.Entry<String,UriRef> entry : previousAnnotations.entrySet()) { + if (entry.getKey().contains(name)) { + // we have found a most specific previous + // occurrence, use it as subsumption target + firstOccurrenceAnnotation = entry.getValue(); + g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation)); + break; + } + } + if (firstOccurrenceAnnotation == null) { + // no most specific previous occurrence, I am the first, + // most specific occurrence to be later used as a target + firstOccurrenceAnnotation = textAnnotation; + previousAnnotations.put(name, textAnnotation); + } + } else { + // I am referring to a most specific first occurrence of the + // same name + g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation)); } - } else { - 
// I am referring to a most specific first occurrence of the - // same name - g.add(new TripleImpl(textAnnotation, DC_RELATION, firstOccurrenceAnnotation)); } } + } finally { + ci.getLock().writeLock().unlock(); } } @@ -390,21 +403,11 @@ public class NEREngineCore implements En } public int canEnhance(ContentItem ci) { - // in case text/pain;charSet=UTF8 is parsed - String mimeType = ci.getMimeType().split(";", 2)[0]; - if(TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType) || //plain test - //or extracted text - ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null).hasNext()){ - //TODO: check if the language metadata are already present when - //canEnhance is called. If not than return ENHANCE_SYNCHRONOUS - if(isProcessedLangage(extractLanguage(ci))){ - return ENHANCE_SYNCHRONOUS; - } else { - return CANNOT_ENHANCE; - } - } else { //no textual content available - return CANNOT_ENHANCE; + if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null + && isProcessedLangage(extractLanguage(ci))){ + return ENHANCE_ASYNC; //The NER engine now supports Async processing! } + return CANNOT_ENHANCE; } /**
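[Editorial note] Taken together, the NEREngineCore changes above replace the removed NIE_PLAINTEXTCONTENT lookup with ContentItemHelper.getBlob(..)/getText(..) and confine metadata writes to a write-locked section, which is what allows canEnhance to return ENHANCE_ASYNC. Below is a minimal sketch of that pattern, condensed from the diff. The class name ExampleBlobBasedEngine and the analyse(..) step are made up for illustration only; the ContentItemHelper, EnhancementEngineHelper and ContentItem lock calls are used as they appear in the changed engines.

    import java.io.IOException;
    import java.util.Collections;
    import java.util.Map.Entry;
    import java.util.Set;

    import org.apache.clerezza.rdf.core.MGraph;
    import org.apache.clerezza.rdf.core.UriRef;
    import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
    import org.apache.clerezza.rdf.core.impl.TripleImpl;
    import org.apache.stanbol.enhancer.servicesapi.Blob;
    import org.apache.stanbol.enhancer.servicesapi.ContentItem;
    import org.apache.stanbol.enhancer.servicesapi.EngineException;
    import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
    import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
    import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
    import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
    import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;

    /**
     * Sketch of the engine pattern applied by this commit: look up the
     * "text/plain" Blob content part, read its text via ContentItemHelper and
     * hold the ContentItem write lock only while adding triples to the
     * metadata graph.
     */
    public abstract class ExampleBlobBasedEngine implements EnhancementEngine {

        protected static final Set<String> SUPPORTED_MIMETYPES =
                Collections.singleton("text/plain");

        @Override
        public int canEnhance(ContentItem ci) throws EngineException {
            // only accept items that provide a content part with a supported mime type
            return ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null
                    ? ENHANCE_ASYNC : CANNOT_ENHANCE;
        }

        @Override
        public void computeEnhancements(ContentItem ci) throws EngineException {
            Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
            if (contentPart == null) { // already checked by canEnhance
                throw new IllegalStateException("No text/plain content part found for "
                        + ci.getUri());
            }
            String text;
            try {
                text = ContentItemHelper.getText(contentPart.getValue());
            } catch (IOException e) {
                throw new InvalidContentException(this, ci, e);
            }
            if (text.trim().isEmpty()) {
                return; // nothing to extract knowledge from
            }
            // the (potentially slow) analysis runs outside of the lock ...
            String selectedText = analyse(text); // hypothetical engine specific step
            // ... and only the writes to the metadata graph are guarded
            MGraph metadata = ci.getMetadata();
            ci.getLock().writeLock().lock();
            try {
                UriRef annotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
                metadata.add(new TripleImpl(annotation, Properties.ENHANCER_SELECTED_TEXT,
                        new PlainLiteralImpl(selectedText)));
            } finally {
                ci.getLock().writeLock().unlock();
            }
        }

        /** Hypothetical placeholder for the engine specific analysis. */
        protected abstract String analyse(String text);
    }

Keeping the analysis outside of the locked region and guarding only the graph writes is what makes asynchronous execution of several engines on the same ContentItem safe.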
Modified: incubator/stanbol/trunk/enhancer/engines/taxonomylinking/src/main/java/org/apache/stanbol/enhancer/engines/taxonomy/impl/TaxonomyLinkingEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/taxonomylinking/src/main/java/org/apache/stanbol/enhancer/engines/taxonomy/impl/TaxonomyLinkingEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/engines/taxonomylinking/src/main/java/org/apache/stanbol/enhancer/engines/taxonomy/impl/TaxonomyLinkingEngine.java (original) +++ incubator/stanbol/trunk/enhancer/engines/taxonomylinking/src/main/java/org/apache/stanbol/enhancer/engines/taxonomy/impl/TaxonomyLinkingEngine.java Thu Feb 2 13:52:27 2012 @@ -16,8 +16,6 @@ */ package org.apache.stanbol.enhancer.engines.taxonomy.impl; -import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT; - import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; @@ -29,6 +27,7 @@ import java.util.HashSet; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.TreeMap; import java.util.TreeSet; @@ -48,11 +47,9 @@ import opennlp.tools.util.Span; import org.apache.clerezza.rdf.core.LiteralFactory; import org.apache.clerezza.rdf.core.MGraph; -import org.apache.clerezza.rdf.core.Triple; import org.apache.clerezza.rdf.core.UriRef; import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; import org.apache.clerezza.rdf.core.impl.TripleImpl; -import org.apache.commons.io.IOUtils; import org.apache.commons.lang.StringUtils; import org.apache.felix.scr.annotations.Activate; import org.apache.felix.scr.annotations.Component; @@ -68,12 +65,14 @@ import org.apache.felix.scr.annotations. import org.apache.felix.scr.annotations.Service; import org.apache.stanbol.commons.opennlp.OpenNLP; import org.apache.stanbol.commons.stanboltools.offline.OfflineMode; +import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses; import org.apache.stanbol.enhancer.servicesapi.rdf.Properties; @@ -150,6 +149,10 @@ public class TaxonomyLinkingEngine protected static final String TEXT_PLAIN_MIMETYPE = "text/plain"; /** + * Contains the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} + */ + protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE); + /** * The default value for the Execution of this Engine. Currently set to * {@link ServiceProperties#ORDERING_EXTRACTION_ENHANCEMENT} + 10. It should run after Metaxa and LangId. 
*/ @@ -360,17 +363,11 @@ public class TaxonomyLinkingEngine @Override public int canEnhance(ContentItem ci) throws EngineException { - String mimeType = ci.getMimeType().split(";", 2)[0]; - if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) { - return ENHANCE_SYNCHRONOUS; - } - // check for existence of textual content in metadata - UriRef subj = ci.getUri(); - Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null); - if (it.hasNext()) { + if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){ return ENHANCE_SYNCHRONOUS; + } else { + return ENHANCE_ASYNC; } - return CANNOT_ENHANCE; } @Override @@ -398,35 +395,32 @@ public class TaxonomyLinkingEngine } else { // null indicates to use the Entityhub to lookup Entities site = null; } - String mimeType = ci.getMimeType().split(";", 2)[0]; + Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); + if(contentPart == null){ + throw new IllegalStateException("No ContentPart with a supported Mime Type" + + "found for ContentItem "+ci.getUri()+"(supported: '" + + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" + + "NOT called and indicates a bug in the used EnhancementJobManager!"); + } String text; - if (TEXT_PLAIN_MIMETYPE.equals(mimeType)) { - try { - text = IOUtils.toString(ci.getStream(),"UTF-8"); - } catch (IOException e) { - throw new InvalidContentException(this, ci, e); - } - } else { - //TODO: change that as soon the Adapter Pattern is used for multiple - // mimetype support. - StringBuilder textBuilder = new StringBuilder(); - Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null); - while (it.hasNext()) { - textBuilder.append(it.next().getObject()); - } - text = textBuilder.toString(); + try { + text = ContentItemHelper.getText(contentPart.getValue()); + } catch (IOException e) { + throw new InvalidContentException(this, ci, e); } if (text.trim().length() == 0) { // TODO: make the length of the data a field of the ContentItem // interface to be able to filter out empty items in the canEnhance // method - log.warn("nothing to extract knowledge from in ContentItem {}", ci); + log.warn("ContentPart {} of ContentItem {} does not contain any text to extract knowledge from.", + contentPart.getKey(),ci.getUri()); return; } - //TODO: determin the language + //TODO: determine the language String language = "en"; - log.debug("computeEnhancements for ContentItem {} language {} text={}", - new Object []{ci.getUri().getUnicodeString(), language, StringUtils.abbreviate(text, 100)}); + log.debug("computeEnhancements for ContentPart {} of ContentItem {} language {} text={}", + new Object [] { contentPart.getKey(),ci.getUri().getUnicodeString(), + language, StringUtils.abbreviate(text, 100) }); //first get the models Tokenizer tokenizer = initTokenizer(language); Modified: incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java (original) +++ 
incubator/stanbol/trunk/enhancer/engines/topic/src/main/java/org/apache/stanbol/enhancer/engine/topic/TopicClassificationEngine.java Thu Feb 2 13:52:27 2012 @@ -16,7 +16,6 @@ */ package org.apache.stanbol.enhancer.engine.topic; -import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT; import java.io.File; import java.io.IOException; @@ -32,6 +31,7 @@ import java.util.Iterator; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import java.util.UUID; @@ -62,11 +62,13 @@ import org.apache.solr.common.SolrInputD import org.apache.solr.common.params.CommonParams; import org.apache.solr.common.params.MoreLikeThisParams; import org.apache.stanbol.commons.solr.utils.StreamQueryRequest; +import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses; import org.apache.stanbol.enhancer.topic.Batch; @@ -161,6 +163,15 @@ public class TopicClassificationEngine e private static final Logger log = LoggerFactory.getLogger(TopicClassificationEngine.class); + /** + * The "text/plain" mime type + */ + protected static final String PLAIN_TEXT_MIMETYPE = "text/plain"; + /** + * Contains the only supported mime type {@link #PLAIN_TEXT_MIMETYPE} + */ + protected static final Set<String> SUPPORTED_MIMETYPES = Collections.singleton(PLAIN_TEXT_MIMETYPE); + public static final String SOLR_NON_EMPTY_FIELD = "[\"\" TO *]"; // TODO: make the following fields configurable @@ -267,20 +278,41 @@ public class TopicClassificationEngine e @Override public int canEnhance(ContentItem ci) throws EngineException { - String text = getTextFromContentItem(ci); - if (getActiveSolrServer() == null) { - log.warn(String.format("Solr Core '%s' is not available.", solrCoreId)); - return CANNOT_ENHANCE; - } - if (text.trim().length() == 0) { + if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null && + getActiveSolrServer() != null){ + return ENHANCE_SYNCHRONOUS; + } else { return CANNOT_ENHANCE; } - return ENHANCE_SYNCHRONOUS; + //TODO ogrisel: validate that it is no problem that this does no longer + //check that the text is not empty +// if (text.trim().length() == 0) { +// return CANNOT_ENHANCE; +// } } @Override public void computeEnhancements(ContentItem ci) throws EngineException { - String text = getTextFromContentItem(ci); + Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); + if(contentPart == null){ + throw new IllegalStateException("No ContentPart with a supported Mime Type" + + "found for ContentItem "+ci.getUri()+"(supported: '" + + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" + + "NOT called and indicates a bug in the used EnhancementJobManager!"); + } + String text; + try { + text = ContentItemHelper.getText(contentPart.getValue()); + } catch (IOException e) { + throw new InvalidContentException(String.format("Unable to extract " + +" textual content from ContentPart %s of ContentItem %s!", + contentPart.getKey(),ci.getUri()), 
e); + } + if(text.trim().isEmpty()){ + log.warn("ContentPart {} of ContentItem {} does not contain any " + + "text to extract topics from",contentPart.getKey(),ci.getUri()); + return; + } MGraph metadata = ci.getMetadata(); List<TopicSuggestion> topics; try { @@ -288,18 +320,23 @@ public class TopicClassificationEngine e } catch (ClassifierException e) { throw new EngineException(e); } - for (TopicSuggestion topic : topics) { - UriRef enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this); - metadata.add(new TripleImpl(enhancement, - org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, - TechnicalClasses.ENHANCER_TOPICANNOTATION)); - - // add link to entity - metadata.add(new TripleImpl(enhancement, - org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, - new UriRef(topic.uri))); - // TODO: make it possible to dereference and the path to the root the entities according to a - // configuration parameter + ci.getLock().writeLock().lock(); + try { + for (TopicSuggestion topic : topics) { + UriRef enhancement = EnhancementEngineHelper.createEntityEnhancement(ci, this); + metadata.add(new TripleImpl(enhancement, + org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE, + TechnicalClasses.ENHANCER_TOPICANNOTATION)); + + // add link to entity + metadata.add(new TripleImpl(enhancement, + org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE, + new UriRef(topic.uri))); + // TODO: make it possible to dereference and the path to the root the entities according to a + // configuration parameter + } + } finally { + ci.getLock().writeLock().unlock(); } } @@ -309,25 +346,6 @@ public class TopicClassificationEngine e (Object) order)); } - protected String getTextFromContentItem(ContentItem ci) throws InvalidContentException { - // Refactor the following using an adapter. 
- String text = ""; - if (ci.getMimeType().startsWith("text/plain")) { - try { - // TODO: handle explicit charsets if any and fallback to UTF-8 if missing - text = IOUtils.toString(ci.getStream(), "UTF-8"); - } catch (IOException e) { - throw new InvalidContentException(this, ci, e); - } - } else { - Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null); - while (it.hasNext()) { - text += it.next().getObject(); - } - } - return text; - } - public static TopicClassificationEngine fromParameters(Dictionary<String,Object> config) throws ConfigurationException { TopicClassificationEngine engine = new TopicClassificationEngine(); engine.configure(config); Modified: incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java (original) +++ incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java Thu Feb 2 13:52:27 2012 @@ -25,19 +25,20 @@ import static org.apache.stanbol.enhance import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; -import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE; import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_CATEGORY; import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; import java.util.Collections; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; import java.util.Map; +import java.util.Map.Entry; import java.util.Set; import org.apache.clerezza.rdf.core.Literal; @@ -59,12 +60,14 @@ import org.apache.felix.scr.annotations. 
import org.apache.felix.scr.annotations.Service; import org.apache.stanbol.commons.stanboltools.offline.OnlineMode; import org.apache.stanbol.enhancer.engines.zemanta.ZemantaOntologyEnum; +import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.EngineException; import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; import org.osgi.framework.BundleContext; import org.osgi.service.cm.ConfigurationException; @@ -73,6 +76,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; + /** * Apache Stanbol Enhancer Zemanta enhancement engine. * This enhancement engine uses the the Zemanta API for enhancing content. @@ -104,8 +108,9 @@ public class ZemantaEnhancementEngine public static final String DMOZ_BASE_URL = "http://www.dmoz.org/"; public static final String ZEMANTA_DMOZ_PREFIX = "Top/"; - protected static final String TEXT_PLAIN_MIMETYPE = "text/plain"; - protected static final String TEXT_HTML_MIMETYPE = "text/html"; + protected static final Set<String> SUPPORTED_MIMETYPES = + Collections.unmodifiableSet(new HashSet<String>( + Arrays.asList("text/plain","text/html"))); private static final Logger log = LoggerFactory.getLogger(ZemantaEnhancementEngine.class); @@ -158,53 +163,31 @@ public class ZemantaEnhancementEngine } public int canEnhance(ContentItem ci) { - if(isTextOrHtml(ci)){ - return ENHANCE_SYNCHRONOUS; + if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES) != null){ + return ENHANCE_ASYNC; //the ZEMANTA engine now supports async processing! } else { - // check for existence of textual content in metadata - UriRef subj = ci.getUri(); - Iterator<Triple> it = ci.getMetadata().filter(subj, NIE_PLAINTEXTCONTENT, null); - if (it.hasNext()) { - return ENHANCE_SYNCHRONOUS; - } + return CANNOT_ENHANCE; } - return CANNOT_ENHANCE; } - /** - * @param ci - */ - private boolean isTextOrHtml(ContentItem ci) { - String mimeType = ci.getMimeType().split(";", 2)[0]; - if (TEXT_PLAIN_MIMETYPE.equalsIgnoreCase(mimeType)) { - return true; - } else if (TEXT_HTML_MIMETYPE.equalsIgnoreCase(mimeType)) { - return true; - } else { - return false; - } - } public void computeEnhancements(ContentItem ci) throws EngineException { + Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES); + if(contentPart == null){ + throw new IllegalStateException("No ContentPart with a supported Mime Type" + + "found for ContentItem "+ci.getUri()+"(supported: '" + + SUPPORTED_MIMETYPES+"') -> this indicates that canEnhance was" + + "NOT called and indicates a bug in the used EnhancementJobManager!"); + } String text; - if(isTextOrHtml(ci)){ - try { - text = IOUtils.toString(ci.getStream(),"UTF-8"); - } catch (IOException e) { - throw new InvalidContentException(this, ci, e); - } - } else { - //TODO: change that as soon the Adapter Pattern is used for multiple - // mimetype support. 
- StringBuilder textBuilder = new StringBuilder(); - Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null); - while (it.hasNext()) { - textBuilder.append(it.next().getObject()); - } - text = textBuilder.toString(); + try { + text = ContentItemHelper.getText(contentPart.getValue()); + } catch (IOException e) { + throw new InvalidContentException(this, ci, e); } if (text.trim().length() == 0) { - log.warn("nothing to enhance"); + log.warn("ContentPart {} of ContentItem {} does not contain any text to enhance", + contentPart.getKey(),ci.getUri()); return; } MGraph graph = ci.getMetadata(); @@ -219,8 +202,13 @@ public class ZemantaEnhancementEngine } //now we need to process the results and convert them into the Enhancer //annotation structure - processRecognition(results, graph, text, ciId); - processCategories(results, graph, ciId); + ci.getLock().writeLock().lock(); + try { + processRecognition(results, graph, text, ciId); + processCategories(results, graph, ciId); + } finally { + ci.getLock().writeLock().unlock(); + } } public Map<String, Object> getServiceProperties() { // TODO Auto-generated method stub Modified: incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemHelper.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemHelper.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemHelper.java (original) +++ incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemHelper.java Thu Feb 2 13:52:27 2012 @@ -22,16 +22,21 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.UnsupportedEncodingException; import java.net.URLEncoder; +import java.nio.charset.Charset; import java.security.MessageDigest; import java.security.NoSuchAlgorithmException; +import java.util.Collections; import java.util.HashMap; import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; import java.util.StringTokenizer; import org.apache.clerezza.rdf.core.UriRef; import org.apache.commons.io.IOUtils; import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; +import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException; /** @@ -48,6 +53,8 @@ public class ContentItemHelper { public static final int MAX_BUF_SIZE = 64 * 1024; // 64 kB private static final char[] HEX_DIGITS = "0123456789abcdef".toCharArray(); + + public static final String UTF8 = "UTF-8"; // TODO: instead of using a static helper, build an OSGi component with a // configurable site-wide URI namespace for ids that are local to the @@ -214,4 +221,68 @@ public class ContentItemHelper { } return parsed; } + /** + * Searches an {@link ContentItem#getPart(UriRef, Class) content part} + * of the type {@link Blob} with one of the the parsed mimeTypes. <p> + * NOTE:<ul> + * <li> MimeTypes are converted to lower case before compared with + * the entries of the parsed set. Therefore it is important that the parsed + * set only contains lower case values! 
+ * <li> A read lock on the parsed {@link ContentItem} is applied while + * searching for a fitting {@link Blob} + * </ul> + * @param ci the contentITem + * @param mimeTypes List of possible mimeTypes + * @return the {@link UriRef URI} and the {@link Blob content} of the content + * part or <code>null</code> if not found + * @throws IllegalArgumentException If the parsed {@link ContentItem} is + * <code>null</code> or the parsed Set with the mimeTypes is <code>null</code> + * or {@link Set#isEmpty() empty}. + */ + public static Entry<UriRef, Blob> getBlob(ContentItem ci, Set<String> mimeTypes){ + if(ci == null){ + throw new IllegalArgumentException("The parsed ContentItem MUST NOT be NULL!"); + } + if(mimeTypes == null || mimeTypes.isEmpty()){ + throw new IllegalArgumentException("The parsed Set with mime type MUST NOT be NULL nor empty!"); + } + UriRef cpUri = null; + int index = 0; + ci.getLock().readLock().lock(); + try { + do { + try { + cpUri = ci.getPartUri(index); + if(cpUri != null){ + Blob blob = ci.getPart(cpUri, Blob.class); + if(blob != null && mimeTypes.contains( + blob.getMimeType().toLowerCase())){ + return Collections.singletonMap(cpUri, blob) + .entrySet().iterator().next(); + } // else no match + } // else no more parts + } catch (NoSuchPartException e) {/* ignore*/} + } while(cpUri != null); + } finally { + ci.getLock().readLock().unlock(); + } + return null; // not found + } + /** + * Getter for the Text of an {@link Blob}. This method respects the + * "charset" if present in the {@link Blob#getParameter() parameter} of the + * Blob. + * @param blob the {@link Blob}. MUST NOT be <code>null</code>. + * @return the text + * @throws IOException on any exception while reading from the + * {@link InputStream} provided by the Blob. + * @throws IllegalArgumentException if the parsed Blob is <code>null</code> + */ + public static String getText(Blob blob) throws IOException { + if(blob == null){ + throw new IllegalArgumentException("The parsed Blob MUST NOT be NULL!"); + } + String charset = blob.getParameter().get("charset"); + return IOUtils.toString(blob.getStream(), charset != null ? 
charset : UTF8); + } } Modified: incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemImpl.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemImpl.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemImpl.java (original) +++ incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/helper/ContentItemImpl.java Thu Feb 2 13:52:27 2012 @@ -136,11 +136,16 @@ public abstract class ContentItemImpl im @SuppressWarnings("unchecked") @Override public <T> T getPart(UriRef uri, Class<T> clazz) throws NoSuchPartException { - if(parts.containsKey(uri)){ - return (T) parts.get(uri); - } else { - throw new NoSuchPartException(uri); - } + readLock.lock(); + try { + if(parts.containsKey(uri)){ + return (T) parts.get(uri); + } else { + throw new NoSuchPartException(uri); + } + }finally { + readLock.unlock(); + } } @Override Modified: incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java (original) +++ incubator/stanbol/trunk/enhancer/generic/servicesapi/src/main/java/org/apache/stanbol/enhancer/servicesapi/rdf/Properties.java Thu Feb 2 13:52:27 2012 @@ -168,10 +168,6 @@ public class Properties { public static final UriRef DC_LANGUAGE = new UriRef(NamespaceEnum.dc + "language"); - /** - * Plain text content of a content item. - */ - public static final UriRef NIE_PLAINTEXTCONTENT = new UriRef(NamespaceEnum.nie + "plainTextContent"); /** * The topic of the resource. 
Used to relate a content item to a Modified: incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java (original) +++ incubator/stanbol/trunk/enhancer/jersey/src/main/java/org/apache/stanbol/enhancer/jersey/resource/ContentItemResource.java Thu Feb 2 13:52:27 2012 @@ -32,7 +32,6 @@ import static org.apache.stanbol.enhance import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.GEO_LAT; import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.GEO_LONG; -import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.NIE_PLAINTEXTCONTENT; import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_ENTITYANNOTATION; import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION; @@ -51,6 +50,7 @@ import java.util.HashMap; import java.util.Iterator; import java.util.List; import java.util.Map; +import java.util.Map.Entry; import java.util.TreeMap; import javax.servlet.ServletContext; @@ -81,8 +81,10 @@ import org.apache.clerezza.rdf.core.spar import org.apache.clerezza.rdf.ontologies.RDF; import org.apache.commons.io.IOUtils; import org.apache.stanbol.commons.web.base.resource.BaseStanbolResource; +import org.apache.stanbol.enhancer.servicesapi.Blob; import org.apache.stanbol.enhancer.servicesapi.ContentItem; import org.apache.stanbol.enhancer.servicesapi.NoSuchPartException; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; import org.apache.stanbol.enhancer.servicesapi.helper.ExecutionMetadataHelper; import org.apache.stanbol.enhancer.servicesapi.helper.ExecutionPlanHelper; @@ -156,17 +158,13 @@ public class ContentItemResource extends if (localId != null) { URI rawURI = uriInfo.getBaseUriBuilder().path(storePath).path("raw").path(localId).build(); - if (ci.getMimeType().equals("text/plain")) { - this.textContent = IOUtils.toString(ci.getStream(), "UTF-8"); - } else if (ci.getMimeType().startsWith("image/")) { + Entry<UriRef,Blob> plainTextContentPart = ContentItemHelper.getBlob(contentItem, Collections.singleton("text/plain")); + if (plainTextContentPart != null) { + this.textContent = ContentItemHelper.getText(plainTextContentPart.getValue()); + } + if (ci.getMimeType().startsWith("image/")) { this.imageSrc = rawURI; } - else { - Iterator<Triple> it = ci.getMetadata().filter(ci.getUri(), NIE_PLAINTEXTCONTENT, null); - if (it.hasNext()) { - this.textContent = ((Literal)it.next().getObject()).getLexicalForm(); - } - } this.downloadHref = rawURI; this.metadataHref = uriInfo.getBaseUriBuilder().path(storePath).path("metadata").path(localId).build(); } Modified: incubator/stanbol/trunk/enhancer/jobmanager/event/src/main/java/org/apache/stanbol/enhancer/jobmanager/event/impl/EnhancementJobHandler.java URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/jobmanager/event/src/main/java/org/apache/stanbol/enhancer/jobmanager/event/impl/EnhancementJobHandler.java?rev=1239618&r1=1239617&r2=1239618&view=diff ============================================================================== --- incubator/stanbol/trunk/enhancer/jobmanager/event/src/main/java/org/apache/stanbol/enhancer/jobmanager/event/impl/EnhancementJobHandler.java (original) +++ incubator/stanbol/trunk/enhancer/jobmanager/event/src/main/java/org/apache/stanbol/enhancer/jobmanager/event/impl/EnhancementJobHandler.java Thu Feb 2 13:52:27 2012 @@ -143,11 +143,12 @@ public class EnhancementJobHandler imple try { processEvent(job, execution); } catch (Throwable t) { + String message = String.format("Unexpected Exception while processing " + + "ContentItem %s with EnhancementJobManager: %s", + job.getContentItem().getUri(),EventJobManagerImpl.class); //this ensures that an runtime exception does not - job.setFailed(execution, null, new IllegalStateException( - "Unexpected Exception while processing ContentItem '" - + job.getContentItem().getUri()+"' with EnhancementJobManager: " - + EventJobManagerImpl.class,t)); + job.setFailed(execution, null, new IllegalStateException(message,t)); + log.error(message,t); } //(2) trigger the next actions log.debug("++ w: {}","check for next Executions"); @@ -218,8 +219,13 @@ public class EnhancementJobHandler imple } catch (EngineException e) { job.setFailed(execution, engine, e); } - } else { //required engine is unable to enhance the content - job.setFailed(execution,engine,exception); + } else { //CANNOT_ENHANCE + if(exception != null){ + job.setFailed(execution,engine,exception); + } else { //can not enhance is not an error + //it just says this engine can not enhance this content item + job.setCompleted(execution); + } } } else { //engine with that name is not available job.setFailed(execution, null, null);
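[Editorial note] For completeness, a small client-side usage sketch of the two ContentItemHelper methods the engines above now rely on. The class name BlobLookupExample is made up; the getBlob(..)/getText(..) contracts (mime types compared in lower case, a read lock held while searching the content parts, UTF-8 fallback when the Blob has no "charset" parameter) are as documented in the ContentItemHelper change above.

    import java.io.IOException;
    import java.util.Arrays;
    import java.util.HashSet;
    import java.util.Map.Entry;
    import java.util.Set;

    import org.apache.clerezza.rdf.core.UriRef;
    import org.apache.stanbol.enhancer.servicesapi.Blob;
    import org.apache.stanbol.enhancer.servicesapi.ContentItem;
    import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;

    public final class BlobLookupExample {

        private BlobLookupExample() {}

        /**
         * Returns the text of the first "text/plain" or "text/html" content
         * part of the parsed ContentItem, or null if no such part exists.
         */
        public static String getTextContent(ContentItem ci) throws IOException {
            // mime types must be lower case, as required by getBlob(..)
            Set<String> mimeTypes = new HashSet<String>(
                    Arrays.asList("text/plain", "text/html"));
            Entry<UriRef,Blob> part = ContentItemHelper.getBlob(ci, mimeTypes);
            if (part == null) {
                return null; // no content part with a supported mime type
            }
            // getText(..) honours the "charset" parameter of the Blob and
            // falls back to UTF-8 if none is given
            return ContentItemHelper.getText(part.getValue());
        }
    }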