Author: rwesten
Date: Sat Feb 25 16:05:32 2012
New Revision: 1293633
URL: http://svn.apache.org/viewvc?rev=1293633&view=rev
Log:
Fixes STANBOL-509: All EnhancementEngines that create TextAnnotations now use
PlainLiterals; also added a check for that to the integration tests
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java
incubator/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java
Modified:
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java?rev=1293633&r1=1293632&r2=1293633&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/keywordextraction/src/main/java/org/apache/stanbol/enhancer/engines/keywordextraction/engine/KeywordLinkingEngine.java
Sat Feb 25 16:05:32 2012
@@ -387,6 +387,10 @@ public class KeywordLinkingEngine
* @param language
*/
private void writeEnhancements(ContentItem ci, Collection<LinkedEntity>
linkedEntities, String language) {
+ Language languageObject = null;
+ if(language != null && !language.isEmpty()){
+ languageObject = new Language(language);
+ }
MGraph metadata = ci.getMetadata();
for(LinkedEntity linkedEntity : linkedEntities){
Collection<UriRef> textAnnotations = new
ArrayList<UriRef>(linkedEntity.getOccurrences().size());
@@ -402,10 +406,10 @@ public class KeywordLinkingEngine
literalFactory.createTypedLiteral(occurrence.getEnd())));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_SELECTION_CONTEXT,
-
literalFactory.createTypedLiteral(occurrence.getContext())));
+ new
PlainLiteralImpl(occurrence.getContext(),languageObject)));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_SELECTED_TEXT,
-
literalFactory.createTypedLiteral(occurrence.getSelectedText())));
+ new
PlainLiteralImpl(occurrence.getSelectedText(),languageObject)));
metadata.add(new TripleImpl(textAnnotation,
Properties.ENHANCER_CONFIDENCE,
literalFactory.createTypedLiteral(linkedEntity.getScore())));
Modified:
incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java?rev=1293633&r1=1293632&r2=1293633&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/opencalais/src/main/java/org/apache/stanbol/enhancer/engines/opencalais/impl/OpenCalaisEngine.java
Sat Feb 25 16:05:32 2012
@@ -50,6 +50,7 @@ import java.util.Map.Entry;
import java.util.Set;
import org.apache.clerezza.rdf.core.Graph;
+import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
@@ -58,6 +59,7 @@ import org.apache.clerezza.rdf.core.Reso
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.access.TcManager;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.clerezza.rdf.core.serializedform.Parser;
@@ -317,6 +319,13 @@ public class OpenCalaisEngine
*/
public void createEnhancements(Collection<CalaisEntityOccurrence> occs,
ContentItem ci) {
LiteralFactory literalFactory = LiteralFactory.getInstance();
+ final Language language; // used for plain literals representing parts
fo the content
+ String langString = getMetadataLanguage(ci.getMetadata(), null);
+ if(langString != null && !langString.isEmpty()){
+ language = new Language(langString);
+ } else {
+ language = null;
+ }
//TODO create TextEnhancement (form, start, end, type?) and
EntityAnnotation (id, name, type)
HashMap<Resource, UriRef> entityAnnotationMap = new HashMap<Resource,
UriRef>();
for (CalaisEntityOccurrence occ : occs) {
@@ -326,14 +335,14 @@ public class OpenCalaisEngine
model.add(new TripleImpl(textAnnotation, DC_TYPE, occ.type));
// for autotagger use the name instead of the matched term (that
might be a pronoun!)
if (onlyNERMode) {
- model.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTED_TEXT,literalFactory.createTypedLiteral(occ.name)));
+ model.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTED_TEXT,new PlainLiteralImpl(occ.name,language)));
}
else {
- model.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTED_TEXT, literalFactory.createTypedLiteral(occ.exact)));
+ model.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(occ.exact,language)));
}
model.add(new TripleImpl(textAnnotation, ENHANCER_START,
literalFactory.createTypedLiteral(occ.offset)));
model.add(new TripleImpl(textAnnotation, ENHANCER_END,
literalFactory.createTypedLiteral(occ.offset + occ.length)));
- model.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTION_CONTEXT, literalFactory.createTypedLiteral(occ.context)));
+ model.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occ.context,language)));
//create EntityAnnotation only once but add a reference to the
textAnnotation
if (entityAnnotationMap.containsKey(occ.id)) {
model.add(new TripleImpl(entityAnnotationMap.get(occ.id),
DC_RELATION, textAnnotation));
Modified:
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1293633&r1=1293632&r2=1293633&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
Sat Feb 25 16:05:32 2012
@@ -46,11 +46,13 @@ import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
+import org.apache.clerezza.rdf.core.Language;
import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
@@ -172,7 +174,7 @@ public class NEREngineCore implements En
if(nameFinderModel == null){
log.info("No NER Model for {} and language {}
available!",typeLabel,language);
} else {
- findNamedEntities(ci, text, typeUri, typeLabel,
nameFinderModel);
+ findNamedEntities(ci, text, language, typeUri, typeLabel,
nameFinderModel);
}
}
} catch (Exception e) {
@@ -182,6 +184,7 @@ public class NEREngineCore implements En
protected void findNamedEntities(final ContentItem ci,
final String text,
+ final String lang,
final UriRef typeUri,
final String typeLabel,
final TokenNameFinderModel
nameFinderModel) {
@@ -193,6 +196,12 @@ public class NEREngineCore implements En
log.warn("NULL was parsed as text for content item " +
ci.getUri().getUnicodeString() + "! -> call ignored");
return;
}
+ final Language language;
+ if(lang != null && !lang.isEmpty()){
+ language = new Language(lang);
+ } else {
+ language = null;
+ }
log.debug("findNamedEntities typeUri={}, type={}, text=",
new Object[]{ typeUri, typeLabel, StringUtils.abbreviate(text,
100) });
LiteralFactory literalFactory = LiteralFactory.getInstance();
@@ -211,10 +220,10 @@ public class NEREngineCore implements En
for (NameOccurrence occurrence : occurrences) {
UriRef textAnnotation =
EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTED_TEXT, literalFactory
- .createTypedLiteral(name)));
- g.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTION_CONTEXT, literalFactory
- .createTypedLiteral(occurrence.context)));
+ g.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTED_TEXT,
+ new PlainLiteralImpl(name, language)));
+ g.add(new TripleImpl(textAnnotation,
ENHANCER_SELECTION_CONTEXT,
+ new PlainLiteralImpl(occurrence.context, language)));
g.add(new TripleImpl(textAnnotation, DC_TYPE, typeUri));
g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE,
literalFactory
.createTypedLiteral(occurrence.confidence)));
Modified:
incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java?rev=1293633&r1=1293632&r2=1293633&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/zemanta/src/main/java/org/apache/stanbol/enhancer/engines/zemanta/impl/ZemantaEnhancementEngine.java
Sat Feb 25 16:05:32 2012
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.zemanta.impl;
+import static
org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.getReferences;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
@@ -48,6 +49,7 @@ import org.apache.clerezza.rdf.core.NonL
import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.TripleCollection;
import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
import org.apache.commons.io.IOUtils;
@@ -277,27 +279,27 @@ public class ZemantaEnhancementEngine
Iterator<Triple> recognitions = results.filter(null, RDF_TYPE,
ZemantaOntologyEnum.Recognition.getUri());
while (recognitions.hasNext()) {
NonLiteral recognition = recognitions.next().getSubject();
- log.info("process recognition " + recognition);
+ log.debug("process recognition " + recognition);
//first get everything we need for the textAnnotations
Double confidence = parseConfidence(results, recognition);
- log.info(" > confidence :" + confidence);
+ log.debug(" > confidence :" + confidence);
String anchor = EnhancementEngineHelper.getString(results,
recognition, ZemantaOntologyEnum.anchor.getUri());
- log.info(" > anchor :" + anchor);
+ log.debug(" > anchor :" + anchor);
Collection<NonLiteral> textAnnotations =
processTextAnnotation(enhancements, text, ciId, anchor, confidence);
- log.info(" > number of textAnnotations :" +
textAnnotations.size());
+ log.debug(" > number of textAnnotations :" +
textAnnotations.size());
//second we need to create the EntityAnnotation that represent the
//recognition
NonLiteral object = EnhancementEngineHelper.getReference(results,
recognition, ZemantaOntologyEnum.object.getUri());
- log.info(" > object :" + object);
+ log.debug(" > object :" + object);
//The targets represent the linked entities
// ... and yes there can be more of them!
//TODO: can we create an EntityAnnotation with several referred
entities?
// Should we use the owl:sameAs to decide that!
Set<UriRef> sameAsSet = new HashSet<UriRef>();
- for (Iterator<UriRef> sameAs =
EnhancementEngineHelper.getReferences(results, object,
ZemantaOntologyEnum.owlSameAs.getUri()); sameAs.hasNext();
sameAsSet.add(sameAs.next()))
+ for (Iterator<UriRef> sameAs = getReferences(results, object,
ZemantaOntologyEnum.owlSameAs.getUri()); sameAs.hasNext();
sameAsSet.add(sameAs.next()))
;
- log.info(" > sameAs :" + sameAsSet);
+ log.debug(" > sameAs :" + sameAsSet);
//now parse the targets and look if there are others than the one
//merged by using sameAs
Iterator<UriRef> targets =
EnhancementEngineHelper.getReferences(results, object,
ZemantaOntologyEnum.target.getUri());
@@ -305,12 +307,12 @@ public class ZemantaEnhancementEngine
while (targets.hasNext()) {
//the entityRef is the URL of the target
UriRef entity = targets.next();
- log.info(" - target :" + entity);
+ log.debug(" - target :" + entity);
UriRef targetType =
EnhancementEngineHelper.getReference(results, entity,
ZemantaOntologyEnum.targetType.getUri());
- log.info(" o type :" + targetType);
+ log.debug(" o type :" + targetType);
if
(ZemantaOntologyEnum.targetType_RDF.getUri().equals(targetType)) {
String targetTitle =
EnhancementEngineHelper.getString(results, entity,
ZemantaOntologyEnum.title.getUri());
- log.info(" o title :" + targetTitle);
+ log.debug(" o title :" + targetTitle);
if (sameAsSet.contains(entity)) {
if (title == null) {
title = targetTitle;
@@ -342,7 +344,7 @@ public class ZemantaEnhancementEngine
new TripleImpl(entityEnhancement,
ENHANCER_ENTITY_REFERENCE, entity));
}
enhancements.add(
- new TripleImpl(entityEnhancement, ENHANCER_ENTITY_LABEL,
literalFactory.createTypedLiteral(title)));
+ new TripleImpl(entityEnhancement, ENHANCER_ENTITY_LABEL,
new PlainLiteralImpl(title)));
}
}
@@ -395,7 +397,7 @@ public class ZemantaEnhancementEngine
private Collection<NonLiteral> processTextAnnotation(MGraph enhancements,
String text, UriRef ciId, String anchor, Double confidence) {
Collection<NonLiteral> textAnnotations = new ArrayList<NonLiteral>();
int anchorLength = anchor.length();
- Literal anchorLiteral = literalFactory.createTypedLiteral(anchor);
+ Literal anchorLiteral = new PlainLiteralImpl(anchor);
//first search for existing TextAnnotations for the anchor
Map<Integer, Collection<NonLiteral>> existingTextAnnotationsMap =
searchExistingTextAnnotations(enhancements, anchorLiteral);
Modified:
incubator/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java?rev=1293633&r1=1293632&r2=1293633&view=diff
==============================================================================
---
incubator/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java
(original)
+++
incubator/stanbol/trunk/integration-tests/src/test/java/org/apache/stanbol/enhancer/it/DefaultChainTest.java
Sat Feb 25 16:05:32 2012
@@ -71,13 +71,16 @@ public class DefaultChainTest extends En
.assertContentRegexp(
//check execution metadata
"http://stanbol.apache.org/ontology/enhancer/executionMetadata#executionPart",
- //check execution of metaxa & if executionPlan is incuded
+ //check execution of metaxa & if executionPlan is included
"http://stanbol.apache.org/ontology/enhancer/executionplan#engine.*metaxa",
"http://purl.org/dc/terms/creator.*LangIdEnhancementEngine",
"http://purl.org/dc/terms/language.*en",
"http://fise.iks-project.eu/ontology/entity-label.*Paris",
"http://purl.org/dc/terms/creator.*org.apache.stanbol.enhancer.engines.opennlp.*EngineCore",
- "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley"
+ "http://fise.iks-project.eu/ontology/entity-label.*Bob Marley",
+ //the following two lines test the use of plain literals (see
STANBOL-509)
+ "http://fise.iks-project.eu/ontology/selected-text.*\"Bob
Marley\"@en",
+
"http://fise.iks-project.eu/ontology/selection-context>.*people such as Bob
Marley.\"@en"
)
.generateDocumentation(
documentor,