svn commit: r1633587 - in /stanbol/trunk: enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/ enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/

rwesten Wed, 22 Oct 2014 03:10:40 -0700

Author: rwesten
Date: Wed Oct 22 10:09:41 2014
New Revision: 1633587

URL: http://svn.apache.org/r1633587
Log:
first implementation of an NLP 2 NIF 2.0 engine (STANBOL-1397)


Added:
    
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
    
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
    
stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java

Added: 
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java?rev=1633587&view=auto
==============================================================================
--- 
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
 (added)
+++ 
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
 Wed Oct 22 10:09:41 2014
@@ -0,0 +1,302 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.nlp2rdf.engine;
+
+import static 
org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.nif.Nif20;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+
+public final class Nif20Helper {
+    
+    /** Factory used by this helper to create the typed literals written to the graph. */
+    private static final LiteralFactory lf = LiteralFactory.getInstance();
+
+    /** Private constructor: this class only provides static members. */
+    private Nif20Helper(){}
+    
+    /**
+     * Read-only map from the {@link SpanTypeEnum} to the NIF 2.0 type registered
+     * for spans of that type. No mapping is present for
+     * {@link SpanTypeEnum#Text} and {@link SpanTypeEnum#TextSection}.
+     */
+    public static final Map<SpanTypeEnum,UriRef> SPAN_TYPE_TO_SSO_TYPE;
+    static {
+        Map<SpanTypeEnum,UriRef> nifTypes = new EnumMap<SpanTypeEnum,UriRef>(SpanTypeEnum.class);
+        //SpanTypeEnum.Text and SpanTypeEnum.TextSection are intentionally not mapped
+        nifTypes.put(SpanTypeEnum.Token, Nif20.Word.getUri());
+        nifTypes.put(SpanTypeEnum.Chunk, Nif20.Phrase.getUri());
+        nifTypes.put(SpanTypeEnum.Sentence, Nif20.Sentence.getUri());
+        SPAN_TYPE_TO_SSO_TYPE = Collections.unmodifiableMap(nifTypes);
+    }
+    
+    /**
+     * Read-only map that maps from the {@link LexicalCategory} to the OLIA
+     * Concept representing the Phrase (e.g. {@link LexicalCategory#Noun} maps
+     * to "<code>http://purl.org/olia/olia.owl#NounPhrase</code>").
+     * Only Noun, Verb, Adjective, Adverb and Conjuction are mapped; for all
+     * other categories {@link Map#get(Object)} returns <code>null</code>.
+     */
+    public static final Map<LexicalCategory,UriRef> LEXICAL_TYPE_TO_PHRASE_TYPE;
+    static {
+        String olia = "http://purl.org/olia/olia.owl#";
+        Map<LexicalCategory,UriRef> mapping = new EnumMap<LexicalCategory,UriRef>(LexicalCategory.class);
+        mapping.put(LexicalCategory.Noun, new UriRef(olia+"NounPhrase"));
+        mapping.put(LexicalCategory.Verb, new UriRef(olia+"VerbPhrase"));
+        mapping.put(LexicalCategory.Adjective, new UriRef(olia+"AdjectivePhrase"));
+        mapping.put(LexicalCategory.Adverb, new UriRef(olia+"AdverbPhrase"));
+        //NOTE(review): the local name mirrors the spelling of
+        //LexicalCategory.Conjuction - confirm that the OLIA ontology defines
+        //a concept with exactly this name
+        mapping.put(LexicalCategory.Conjuction, new UriRef(olia+"ConjuctionPhrase"));
+        LEXICAL_TYPE_TO_PHRASE_TYPE = Collections.unmodifiableMap(mapping);
+    }
+    /**
+     * Creates a NIF2.0 Fragment URI using the parsed base URI and the 
start/end
+     * indexes.
+     * @param base the base URI
+     * @param start the start position. If <code>&lt; 0</code> than zero is 
added.
+     * @param end the end position or values &lt; 1 when open ended.
+     * @return the NIF 2.0 Fragment URI
+     * @throws IllegalArgumentException if <code>null</code> is parsed as base
+     * {@link UriRef} or the end position is &gt;=0 but &lt= the parsed start
+     * position.
+     */
+    public static final UriRef getNifFragmentURI(UriRef base, int start,int 
end){
+        if(base == null){
+            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+        }
+        StringBuilder sb = new StringBuilder(base.getUnicodeString());
+        sb.append("#char=");
+        sb.append(start >= 0 ? start : 0).append(',');
+        if(end >= 0){
+            if(end < start){
+                throw new IllegalArgumentException("End index '"+end+"' < 
start '"+start+"'!");
+            }
+            sb.append(end);
+        } //else open ended ...
+        return new UriRef(sb.toString());
+    }
+ 
+    public static final UriRef getNifRFC5147URI(UriRef base, int start, int 
end){
+        if(base == null){
+            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+        }
+        assert start >= 0;
+        assert end < 0 || end >= start;
+        StringBuilder sb = new StringBuilder(base.getUnicodeString());
+        sb.append("#char=");
+        sb.append(start >= 0 ? start : 0);
+        if(end >= 0){
+            sb.append(',').append(end);
+        } //else select the whole string ...
+        return new UriRef(sb.toString());
+    }
+    
+    /**
+     * Number of characters before and after the selection that
+     * {@link #getNifHashURI(UriRef, int, int, String)} includes in the digested
+     * context. The value is also written to the hash URI.
+     */
+    public static final int NIF_HASH_CONTEXT_LENGTH = 10;
+    /**
+     * Maximum number of characters of the selected text that
+     * {@link #getNifHashURI(UriRef, int, int, String)} appends to the hash URI.
+     */
+    public static final int NIF_HASH_MAX_STRING_LENGTH = 20;
+    
+    /** Charset used to encode the context string before it is digested. */
+    public static final Charset UTF8 = Charset.forName("UTF8");
+    
+    /**
+     * Builds a hash based fragment URI of the form
+     * <code>{base}#hash_{contextLength}_{length}_{digest}_{string}</code> where
+     * <code>contextLength</code> is {@link #NIF_HASH_CONTEXT_LENGTH},
+     * <code>length</code> is <code>end - start</code>, <code>digest</code> is
+     * the digest over the selection and its surrounding context and
+     * <code>string</code> are the first (at most
+     * {@link #NIF_HASH_MAX_STRING_LENGTH}) characters of the selection.
+     * @param base the base URI
+     * @param start the start index (inclusive). Negative values are treated
+     * as <code>0</code>.
+     * @param end the end index (exclusive). Negative values are treated as
+     * <code>start</code> (empty selection).
+     * @param text the text the indexes refer to
+     * @return the hash based fragment URI
+     * @throws IllegalArgumentException if <code>null</code> is parsed as base
+     * or text, if the end index is lower than the start index or if the end
+     * index is greater than the length of the text
+     */
+    public static final UriRef getNifHashURI(UriRef base, int start, int end, String text){
+        if(base == null){
+            throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+        }
+        if(text == null){
+            throw new IllegalArgumentException("The text MUST NOT be NULL!");
+        }
+        start = start < 0 ? 0 : start;
+        end = end < 0 ? start : end;
+        if(end < start){
+            throw new IllegalArgumentException("End index '"+end+"' < start '"+start+"'!");
+        }
+        //the end index is exclusive (see the substring call below), so a
+        //selection is allowed to end exactly at the end of the text
+        if(end > text.length()){
+            throw new IllegalArgumentException("The End index '"+end+"' exeeds the "
+                + "length of the text '"+text.length()+"'!");
+        }
+        int contextStart = Math.max(0, start-NIF_HASH_CONTEXT_LENGTH);
+        int contextEnd = Math.min(text.length(), end+NIF_HASH_CONTEXT_LENGTH);
+        StringBuilder sb = new StringBuilder(base.getUnicodeString());
+        sb.append("#hash_");
+        sb.append(NIF_HASH_CONTEXT_LENGTH);
+        sb.append('_');
+        sb.append(end-start);
+        sb.append('_');
+        sb.append(getContextDigest(text, contextStart, start, end, contextEnd));
+        sb.append('_');
+        //only the first NIF_HASH_MAX_STRING_LENGTH chars of the selection are added
+        sb.append(text.substring(start, 
+            Math.min(end,start+NIF_HASH_MAX_STRING_LENGTH)));
+        return new UriRef(sb.toString());
+    }
+
+    /**
+     * Calculates the MD5 digest over the UTF8 encoded string
+     * <code>'{prefix}({selected}){suffix}'</code> built from the parsed text
+     * and indexes (NOTE the brackets that are added at the start/end of the
+     * selected text).
+     * @param text the text
+     * @param contextStart the start index of the prefix
+     * @param start the start index of the selected text part
+     * @param end the end index of the selected text part
+     * @param contextEnd the end index of the suffix
+     * @return the digest as returned by
+     * {@link ContentItemHelper#streamDigest(java.io.InputStream, java.io.OutputStream, String)}
+     */
+    private static String getContextDigest(String text, int contextStart, int start, int end, int contextEnd) {
+        StringBuilder context = new StringBuilder();
+        if(contextStart < start){ //prefix
+            context.append(text, contextStart, start);
+        }
+        context.append('(');
+        if(start < end){ //selected text
+            context.append(text, start, end);
+        }
+        context.append(')');
+        if(end < contextEnd){ //suffix
+            context.append(text, end, contextEnd);
+        }
+        byte[] utf8Context = context.toString().getBytes(UTF8);
+        try {
+            return ContentItemHelper.streamDigest(
+                new ByteArrayInputStream(utf8Context), null, "MD5");
+        } catch (IOException e) {
+            //NO IOExceptions in in-memory stream implementations
+            throw new IllegalStateException(e);
+        }
+    }
+
+    /**
+     * Writes basic information of the parsed span by using NIF 1.0 including 
the
+     * {@link SsoOntology} Sentence/Phrase/Word type based on 
+     * the {@link Span#getType()}<p>
+     * As {@link AnalysedText} is based on the plain text version of the 
ContentItem
+     * this uses the {@link StringOntology#OffsetBasedString} notation.<p>
+     * <i>NOTE:</i> This DOES NOT write string relations, lemma, pos ... 
information
+     * that might be stored as {@link Annotation} with the parsed {@link Span}.
+     * @param graph the graph to add the triples
+     * @param base the base URI
+     * @param text the {@link AnalysedText}
+     * @param language the {@link Language} or <code>null</code> if not known
+     * @param span the {@link Span} to write.
+     * @return the {@link UriRef} representing the parsed {@link Span} in the
+     * graph
+     */
+    public static UriRef writeSpan(MGraph graph, UriRef base, AnalysedText 
text, Language language, Span span){
+        UriRef segment = getNifRFC5147URI(base, span.getStart(), 
+                       span.getType() == SpanTypeEnum.Text ? -1 : 
span.getEnd());
+        graph.add(new TripleImpl(segment, RDF_TYPE, 
Nif20.RFC5147String.getUri()));
+        if(span.getEnd() - span.getStart() < 100){
+               graph.add(new TripleImpl(segment, Nif20.anchorOf.getUri(), 
+                   new PlainLiteralImpl(span.getSpan(),language)));
+        } else {
+               graph.add(new TripleImpl(segment, Nif20.head.getUri(), 
+                   new 
PlainLiteralImpl(span.getSpan().substring(0,10),language)));
+        }
+        graph.add(new TripleImpl(segment, Nif20.beginIndex.getUri(), 
+            lf.createTypedLiteral(span.getStart())));
+        graph.add(new TripleImpl(segment, Nif20.endIndex.getUri(), 
+            lf.createTypedLiteral(span.getEnd())));
+        switch (span.getType()) {
+            case Token:
+                graph.add(new TripleImpl(segment, RDF_TYPE, 
Nif20.Word.getUri()));
+                break;
+            case Chunk:
+                graph.add(new TripleImpl(segment, RDF_TYPE, 
Nif20.Phrase.getUri()));
+                break;
+            case Sentence:
+                graph.add(new TripleImpl(segment, RDF_TYPE, 
Nif20.Sentence.getUri()));
+                break;
+            case Text:
+                graph.add(new TripleImpl(segment, RDF_TYPE, 
Nif20.Context.getUri()));
+                break;
+            default:
+               // no default:
+        }
+        return segment;
+    }
+    
+    /**
+     * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 1.0 to the 
parsed
+     * RDF graph by using the parsed segmentUri as subject
+     * @param graph the graph
+     * @param annotated the annotated element (e.g. a {@link Token})
+     * @param segmentUri the URI of the resource representing the parsed 
+     * annotated element in the graph
+     */
+    public static void writePos(MGraph graph, Annotated annotated, UriRef 
segmentUri) {
+        Value<PosTag> posTag = 
annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+        if(posTag != null){
+            if(posTag.value().isMapped()){
+                for(Pos pos : posTag.value().getPos()){
+                    graph.add(new TripleImpl(segmentUri, 
Nif20.oliaCategory.getUri(), 
+                        pos.getUri()));
+                }
+                for(LexicalCategory cat : posTag.value().getCategories()){
+                    graph.add(new TripleImpl(segmentUri, 
Nif20.oliaCategory.getUri(), 
+                        cat.getUri()));
+                }
+            }
+            graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(), 
+                lf.createTypedLiteral(posTag.value().getTag())));
+            graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, 
+                lf.createTypedLiteral(posTag.probability())));
+        }
+    }    
+    
+    /**
+     * Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} to the parsed RDF
+     * graph by using the segmentUri as subject. The phrase type is looked up
+     * in {@link #LEXICAL_TYPE_TO_PHRASE_TYPE}; if the element has no phrase
+     * annotation or no phrase type is mapped for its category nothing is
+     * written.
+     * @param graph the graph
+     * @param annotated the annotated element (e.g. a {@link Chunk})
+     * @param segmentUri the URI of the resource representing the parsed 
+     * annotated element in the graph
+     */
+    public static void writePhrase(MGraph graph, Annotated annotated, UriRef segmentUri) {
+        Value<PhraseTag> phraseValue = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
+        if(phraseValue == null){
+            return; //no phrase annotation present
+        }
+        UriRef oliaPhraseType = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phraseValue.value().getCategory());
+        if(oliaPhraseType == null){
+            return; //no OLIA phrase type for this category
+        }
+        //add the oliaLink for the Phrase together with the confidence
+        graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), oliaPhraseType));
+        graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE, 
+            lf.createTypedLiteral(phraseValue.probability())));
+    }
+
+}

Added: 
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java?rev=1633587&view=auto
==============================================================================
--- 
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
 (added)
+++ 
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
 Wed Oct 22 10:09:41 2014
@@ -0,0 +1,197 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.nlp2rdf.engine;
+
+import static 
org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.nif.Nif20;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Component(immediate = true, metatype = true, 
+    configurationFactory = true, //allow multiple configuration
+    policy = ConfigurationPolicy.OPTIONAL) //create a default instance
+@Service
+@Properties(value={
+        @Property(name= EnhancementEngine.PROPERTY_NAME,value="nif20")
+})
+public class Nif20MetadataEngine extends 
AbstractEnhancementEngine<RuntimeException,RuntimeException> implements 
ServiceProperties{
+
+    /** Logger of this engine. */
+    private final Logger log = 
LoggerFactory.getLogger(Nif20MetadataEngine.class);
+    /**
+     * Property used to write sentiment values of spans.
+     */
+    //TODO: replace this with a real ontology
+    private final static UriRef SENTIMENT_PROPERTY = new 
UriRef(NamespaceEnum.fise+"sentiment-value");
+    /** Factory used to create the typed literals for sentiment values. */
+    private final LiteralFactory lf = LiteralFactory.getInstance();
+    
+    /**
+     * Activates the engine by delegating to the activate method of the super
+     * class. No engine specific configuration is read yet (see TODO).
+     *
+     * @param ce the {@link org.osgi.service.component.ComponentContext}
+     * @throws ConfigurationException if thrown by the activate method of the
+     * super class
+     */
+    @Activate
+    protected void activate(ComponentContext ce) throws ConfigurationException {
+        log.info("activating NIF 2.0 metadata engine");
+        super.activate(ce);
+        //TODO: read configuration from ce.getProperties()
+    }
+    
+    /**
+     * Checks if an {@link AnalysedText} can be obtained for the parsed
+     * content item.
+     * @param ci the content item
+     * @return <code>ENHANCE_ASYNC</code> if an {@link AnalysedText} is
+     * available, otherwise <code>CANNOT_ENHANCE</code>
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        if(getAnalysedText(this, ci, false) == null){
+            return CANNOT_ENHANCE;
+        }
+        return ENHANCE_ASYNC;
+    }
+
+    /**
+     * Writes the content of the {@link AnalysedText} of the parsed content
+     * item as NIF 2.0 to the metadata of the content item. First the context
+     * (the whole text) is written, than all enclosed Sentences, Chunks and
+     * Tokens are iterated. For each span the generic information, the
+     * reference to the context, the relations to the previously processed
+     * spans, POS and phrase annotations as well as sentiment values are added.
+     * All triples are added while holding the write lock of the content item.
+     * @param ci the content item
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        AnalysedText at = getAnalysedText(this, ci, true);
+        String lang = EnhancementEngineHelper.getLanguage(ci);
+        Language language = lang == null ? null : new Language(lang);
+        //now iterate over the AnalysedText data and create the RDF representation
+        //TODO: make configureable
+        boolean sentences = true;
+        boolean phrases = true;
+        boolean words = true;
+        
+        //collect the span types that are written by this engine
+        EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
+        if(sentences){
+            activeTypes.add(SpanTypeEnum.Sentence);
+        }
+        if(phrases){
+            activeTypes.add(SpanTypeEnum.Chunk);
+        }
+        if(words){
+            activeTypes.add(SpanTypeEnum.Token);
+        }
+        MGraph metadata = ci.getMetadata();
+        UriRef base = ci.getUri();
+        ci.getLock().writeLock().lock();
+        try {
+               //write the context
+               UriRef text = Nif20Helper.writeSpan(metadata, base, at, 
language, at);
+               metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), 
ci.getUri()));
+               
+            Iterator<Span> spans = at.getEnclosed(activeTypes);
+            //URIs of the last processed span of each type (null if none so far)
+            //NOTE(review): phrase and word are never reset when a new Sentence
+            //starts, so relations written for Tokens may refer to a phrase or
+            //word of an earlier sentence - confirm this is intended
+            UriRef sentence = null;
+            UriRef phrase = null;
+            UriRef word = null;
+            boolean firstWordInSentence = true;
+            while(spans.hasNext()){
+                Span span = spans.next();
+                //TODO: filter Spans based on additional requirements
+                //(1) write generic information about the span
+                UriRef current = Nif20Helper.writeSpan(metadata, base, at, 
language, span);
+                //write the context
+                metadata.add(new TripleImpl(current, 
Nif20.referenceContext.getUri(), text));
+                //(2) add the relations between the different spans
+                switch (span.getType()) {
+                    case Sentence:
+                        //link the previous sentence with the current one
+                        if(sentence != null){
+                            metadata.add(new TripleImpl(sentence, 
Nif20.nextSentence.getUri(), current));
+                        }
+                        sentence = current;
+                        firstWordInSentence = true;
+                        break;
+                    case Chunk:
+                        if(sentence != null){
+                            metadata.add(new TripleImpl(current, 
Nif20.superString.getUri(), sentence));
+                            //NOTE(review): this writes a lastWord triple for the
+                            //previously processed word every time a Chunk is
+                            //encountered (not when a sentence ends) - confirm
+                            //this is intended
+                            if(word != null){
+                                metadata.add(new TripleImpl(word, 
Nif20.lastWord.getUri(), sentence));
+                            }
+                        }
+                        phrase = current;
+                        break;
+                    case Token:
+                        if(sentence != null){
+                            metadata.add(new TripleImpl(current, 
Nif20.sentence.getUri(), sentence));
+                            //only the first Token after a Sentence is marked
+                            if(firstWordInSentence){
+                                metadata.add(new TripleImpl(current, 
Nif20.firstWord.getUri(), sentence));
+                                firstWordInSentence = false;
+                            }
+                        }
+                        if(phrase != null){
+                            metadata.add(new TripleImpl(current, 
Nif20.subString.getUri(), phrase));
+                        }
+                        //link the previous word and the current one in both directions
+                        if(word != null){
+                            metadata.add(new TripleImpl(word, 
Nif20.nextWord.getUri(), current));
+                            metadata.add(new TripleImpl(current, 
Nif20.previousWord.getUri(), word));
+                        }
+                        word = current;
+                        break;
+                    default:
+                        break;
+                }
+                //(3) add specific information such as POS, chunk type ...
+                Nif20Helper.writePos(metadata, span, current);
+                Nif20Helper.writePhrase(metadata, span, current);
+
+                //OlIA does not include Sentiments
+                
+                Value<Double> sentiment = 
span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
+                if(sentiment != null && sentiment.value() != null){
+                    metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY, 
+                        lf.createTypedLiteral(sentiment.value())));
+                }
+            }
+        } finally {
+            ci.getLock().writeLock().unlock();
+        }
+    }
+
+    @Override
+    public Map<String,Object> getServiceProperties() {
+        return 
Collections.singletonMap(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING, 
+            (Object)ServiceProperties.ORDERING_POST_PROCESSING);
+    }
+
+
+
+
+
+}

Added: 
stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java
URL: 
http://svn.apache.org/viewvc/stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java?rev=1633587&view=auto
==============================================================================
--- 
stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java
 (added)
+++ 
stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java
 Wed Oct 22 10:09:41 2014
@@ -0,0 +1,545 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.nlp.nif;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+
+public enum Nif20 {
+       /**
+        * A URI Scheme for NIF which is able to refer to a single, consecutive 
+        * string in a context. Note that any scheme subclassing this class, 
+        * requires the existence of beginIndex, endIndex and referenceContext .
+        * <p>
+        * This is an abstract class and should not be serialized. 
+        */
+       CString,
+       /**
+        * An arbitrary URI (e.g. a URN) for an arbitrary string of the context. 
+        * This is roughly the same as TextAnnotations are currently 
implemented in Stanbol.
+        */
+       CStringInst,
+       /**
+        * The string that serves as a context for its substrings. The Unicode 
String 
+        * given in the nif:isString property must be used to calculate the 
begin and 
+        * endIndex for all nif:Strings that have a nif:referenceContext 
property to 
+        * this URI. For further information, see 
+        * http://svn.aksw.org/papers/2013/ISWC_NIF/public.pdf 
+        */
+       Context,
+       /**
+        * A collection of contexts used to create an unordered set of context 
via 
+        * the nif:hasContext property. This can be compared to a document 
collection, 
+        * but here it is a collection of nif:Context and therefore a 
collection of 
+        * annotated strings, not documents. 
+        */
+       ContextCollection,
+       /**
+        * see <a href="http://jens-lehmann.org/files/2012/ekaw_nif.pdf">
+        * Linked-Data Aware URI Schemes for Referencing Text Fragments</a> 
+        * by Sebastian Hellmann, Jens Lehmann and Sören Auer in EKAW 2012 for 
more
+        * information.
+        */
+       ContextHashBasedString,
+       /**
+        * see <a href="http://jens-lehmann.org/files/2012/ekaw_nif.pdf">
+        * Linked-Data Aware URI Schemes for Referencing Text Fragments</a> 
+        * by Sebastian Hellmann, Jens Lehmann and Sören Auer in EKAW 2012 for 
more
+        * information.
+        */
+       OffsetBasedString,
+       /**
+        * A Paragraph
+        */
+       Paragraph,
+       /**
+        * A nif:Phrase can be a nif:String, that is a chunk of several words 
or a 
+        * word itself (e.g. a NounPhrase as a Named Entity). The term is 
underspecified 
+        * and can be compatible with many definitions of phrase. Please 
subClass 
+        * it to specify the meaning (e.g. for Chunking or Phrase Structure 
Grammar). 
+        * Example: ((My dog)(also)(likes)(eating (sausage))) 
+        */
+       Phrase,
+       /**
+        * URIs of this class have to conform with the syntax of <a 
+        * href="http://tools.ietf.org/html/rfc5147">RFC 5147</a> in a way that 
they 
+        * end on a valid identifier, if you remove the prefix. Note that 
unlike 
+        * RFC 5147 NIF does not require '#' URIs. So valid URIs are 
+        * http://example.org#char=0,28 , http://example.org/whatever/char=0,28 
, 
+        * http://example.org/nif?char=0,28
+        */
+       RFC5147String,
+       /**
+        * A Sentence 
+        */
+       Sentence,
+       /**
+        * Individuals of this class are a string, i.e. Unicode characters, who 
+        * have been given a URI and are used in the subject of an RDF 
statement.
+        * <p>
+        * This class is abstract and should not be serialized.
+        * <p>
+        * NIF-Stanbol (nif-stanbol.ttl): subclassOf nifs:Annotation because it 
+        * "annotates" strings for example with begin and end index. The class 
is 
+        * similar to fise:TextAnnotation
+        */
+       String,
+       /**
+        * A structure is a more or less arbitrary label for a partitioning of 
a 
+        * string. We do not follow a strict approach for what a word, phrase, 
+        * sentence, title, paragraph is. These labels enable the definition 
+        * processes for tool chains, e.g. tool analyses nif:Paragraph and 
+        * calculates term frequency.
+        * <p>
+        * This is an abstract class and should not be serialized. 
+        */
+       Structure,
+       /**
+        * A title within a text.
+        */
+       Title,
+       /**
+        * A URI Scheme for NIF, subclasses need to define guidelines on the 
URI 
+        * Scheme as well as the text it refers to. This class is just to keep 
some 
+        * order, and should not be serialized.
+        * <p>
+        * This is an abstract class and should not be serialized. 
+        */
+       URIScheme,
+       /**
+        *  The Word class represents strings that are tokens or words. A 
string is 
+        *  a Word, if it is a word. We don't nitpick about whether it is a 
pronoun, 
+        *  a name, a punctuation mark or an apostrophe or whether it is 
separated 
+        *  by white space from another Word or something else. The string 
+        *  'He enters the room.' for example has 5 words. Words are assigned 
by a 
+        *  tokenizer NIF Implementation. Single word phrases might be tagged 
as 
+        *  nif:Word and nif:Phrase.
+        *  
+        *  Example 1: "The White House" are three Words separated by whitespace
+        *  
+        *  Comment 1: We adopted the definition style from foaf:Person, see 
+        *  here: http://xmlns.com/foaf/spec/#term_Person We are well aware 
that 
+        *  the world out there is much more complicated, but we are ignorant 
about 
+        *  it, for the following reasons:
+        *  
+        *  Comment 2: <ol>
+        *  <li> NIF has a client-server and the client has the ability to 
+        *  dictate the tokenization to the server (i.e. the NIF 
Implementation) by 
+        *  sending properly tokenized NIF annotated with nif:Word. All NIF 
+        *  Implementations are supposed to honor and respect the current 
assignment 
+        *  of the Word class. Thus the client should decide which NIF 
Implementation 
+        *  should create the tokenization. Therefore this class is not 
descriptive, 
+        *  but prescriptive.
+        *  <li>The client may choose to send an existing tokenization to a NIF 
+        *  Implementation, with the capability to change (for better or for 
worse) 
+        *  the tokenization.
+        *  </ol>
+        *  
+        *  The class has not been named 'Token' as the NLP definition of 
'token' 
+        *  is descriptive (and not well-defined), while the assignment of what 
is 
+        *  a Word and what not is prescriptive, e.g. "can't" could be 
described as 
+        *  one, two or three tokens or defined as being one, two or three 
words. 
+        *  For further reading, we refer the reader to: By all these lovely 
tokens... 
+        *  Merging conflicting tokenizations by Christian Chiarcos, Julia 
Ritz, and 
+        *  Manfred Stede. Language Resources and Evaluation 46(1):53-74 (2012) 
or 
+        *  the short form: http://www.aclweb.org/anthology/W09-3005
+        *  
+        *  There the task at hand is to merge two tokenization T_1 and T_2 
which 
+        *  is normally not the case in the NIF world as tokenization is 
prescribed, 
+        *  i.e. given as a baseline (Note that this ideal state might not be 
+        *  achieved by all implementations.)
+        */
+       Word,
+       //Object Properties
+       /**
+        * see <a 
href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+        * Towards Web-Scale Collaborative Knowledge Extraction</a> – page 21
+        */
+       annotation,
+       /**
+        * This property should be used to express that one Context is 
contained in 
+        * another Context, e.g. several sentences of a document are modelled 
+        * individually and refer to the broader context of the whole document.
+        */
+       broaderContext,
+       /**
+        * A dependency relation pointing from gov to dep.
+        */
+       dependency,
+       /**
+        * Links a nif:ContextCollection to its contexts. 
+        */
+       hasContext,
+       /**
+        * This property links sentences to their first word.
+        */
+       firstWord,
+       /**
+        * This property links sentences to their last word.
+        */
+       lastWord,
+       /**
+        * This property links sentences to their words.
+        */
+       word,
+       /**
+        * This object property models a relation between two nif:Strings. 
+        * The name "inter" is kept generic and can be used to express any kind 
of 
+        * relation in between (inter) two nif:Strings. Extensions can create 
+        * rdfs:subPropertyOf for "head", "dependent", nif:substring and 
+        * nif:nextWord. 
+        */
+       inter,
+       /**
+        * Defines the language of a substring of the context. 
+        * If the language for the nif:Context should be specified, 
+        * nif:predominantLanguage must be used. 
+        */
+       lang,
+       /**
+        * The inverse of nif:broaderContext
+        */
+       narrowerContext,
+       /**
+        * This property can be used to make resources of 
+        * nif:Sentence traversable, it can not be assumed that no gaps 
+        * or whitespaces between sentences or words exist, i.e. string 
adjacency 
+        * is not mandatory. The transitivity axioms are included in 
nif-core-inf.ttl 
+        * and need to be included separately to keep a low reasoning profile. 
+        * They are modeled after skos:broader and skos:broaderTransitive
+        */
+       nextSentence,
+       /**
+        * transitive version of {@link #nextSentence}
+        */
+       nextSentenceTrans,
+       /**
+        * This property can be used to make resources of 
+        * nif:Word traversable, it can not be assumed that no gaps 
+        * or whitespaces between sentences or words exist, i.e. string 
adjacency 
+        * is not mandatory. The transitivity axioms are included in 
nif-core-inf.ttl 
+        * and need to be included separately to keep a low reasoning profile. 
+        * They are modeled after skos:broader and skos:broaderTransitive
+        */
+       nextWord,
+       /**
+        * transitive version of {@link #nextWord}
+        */
+       nextWordTrans,
+       /**
+        * This property links a string to a URI from one of the OLiA 
Annotation model, 
+        *  - members of the {@link Pos} enumeration 
+        */
+       oliaLink,
+       /**
+        * This property is used to link to a <a 
href="http://marl.gi2mo.org/?page_id=1#overview">marl:Opinion</a>. 
+        * We have not investigated marl, so it might be replaced.
+        * <p>
+        * InverseOf marl:extractedFrom
+        */
+       opinion,
+       /**
+        * Defines the predominant language of the text. If this annotation is 
given 
+        * on a nif:Context, all NIF tools have to treat the text to be in this 
+        * language unless specified differently for a subpart. To change the 
+        * language for a smaller substring nif:lang must be used.
+        * <p>
+        * This property requires a uri as an argument. We expect this to be a 
URI 
+        * from the lexvo.org namespace, e.g. http://lexvo.org/id/iso639-3/eng 
using 
+        * ISO639-3
+        * <p>
+        * Examples:
+        * <p>
+        * "The dealer says: "Rien ne va plus!" "
+        * <p>
+        * has nif:predominantLanguage http://lexvo.org/id/iso639-3/eng and 
+        * nif:lang http://www.lexvo.org/page/iso639-3/fra
+        * <p>
+        * see also: http://www.w3.org/TR/its20/#selection-local
+        * <p>
+        * Tests for RDFUnit (not written yet):
+        * <p>
+        * - write a test for RDFUnit, so people do not use 
+        * http://www.lexvo.org/page/iso639-3/eng 
+        */
+       predLang,
+       /**
+        * This property can be used to make resources of 
+        * nif:Sentence traversable, it can not be assumed that no gaps 
+        * or whitespaces between sentences or words exist, i.e. string 
adjacency 
+        * is not mandatory. The transitivity axioms are included in 
nif-core-inf.ttl 
+        * and need to be included separately to keep a low reasoning profile. 
+        * They are modeled after skos:broader and skos:broaderTransitive
+        */
+       previousSentence,
+       /**
+        * Transitive version of {@link #previousSentence}
+        */
+       previousSentenceTrans,
+       /**
+        * This property can be used to make resources of 
+        * nif:Word traversable, it can not be assumed that no gaps 
+        * or whitespaces between sentences or words exist, i.e. string 
adjacency 
+        * is not mandatory. The transitivity axioms are included in 
nif-core-inf.ttl 
+        * and need to be included separately to keep a low reasoning profile. 
+        * They are modeled after skos:broader and skos:broaderTransitive
+        */
+       previousWord,
+       /**
+        * Transitive version of {@link #previousWord}
+        */
+       previousWordTrans,
+       /**
+        * Links to the URI describing the provenance
+        */
+       oliaProv,
+       /**
+        * Links a URI of a string to its reference context of type 
nif:Context. 
+        * The reference context determines the calculation of begin and end 
index
+        * <p>
+        * Each String that is not an instance of nif:Context MUST have exactly 
one 
+        * reference context.
+        * <p>
+        * Inferences (nif-core-inf.ttl):
+        * <p>
+        * Instances of nif:Context do have itself as reference context, this 
is 
+        * inferred automatically, MAY be materialized, as well.
+        * <p>
+        * OWL validation (nif-core-val.ttl):
+        * <p>
+        * This property is functional.
+        */
+       referenceContext,
+       /**
+        * This property links words to their sentence.
+        */
+       sentence,
+       /**
+        * The URL the context was extracted from, e.g. the blog or news 
article url. 
+        * Doesn't matter whether it is HTML or XML or plain text. rdfs:range 
is 
+        * foaf:Document. Subproperty of prov:hadPrimarySource. In case the 
string 
+        * comes from another NIF String and gives the exact provenance, please 
use 
+        * nif:wasConvertedFrom or a subProperty thereof.
+        */
+       sourceUrl,
+       /**
+        * This property together with nif:subString, nif:superString, and 
their 
+        * transitive extension can be used to express that one string is 
contained 
+        * in another one. Examples: "a" nif:subString "apple" , "apple" 
+        * nif:subString "apple". The transitivity axioms are included in 
+        * nif-core-inf.ttl and need to be included separately to keep a low 
+        * reasoning profile. They are modeled after skos:broader and 
+        * skos:broaderTransitive
+        */
+       subString,
+       /**
+        * Inverse of {@link #subString}
+        */
+       superString,
+       /**
+        * Transitive version of {@link #dependency}
+        */
+       dependencyTrans,
+       /**
+        * Transitive version of {@link #subString}
+        */
+       subStringTrans,
+       /**
+        * Transitive version of {@link #superString}
+        */
+       superStringTrans,
+       /**
+        * This property should be used, when mapping one nif:String or 
nif:Context 
+        * to another and is often confused with nif:sourceUrl.
+        * <p>
+        * While nif:sourceUrl is built on PROV-O and is used to link the 
nif:Context 
+        * to the document URL for provenance information, nif:wasConvertedFrom is 
more 
+        * precise and pinpoints exact locations where a certain NIF String 
+        * "wasConvertedFrom".
+        * <p>
+        * nif:wasConvertedFrom is therefore used to provide *exact* provenance 
+        * during a conversion process, e.g. when removing tags from XHTML and 
then 
+        * linking XPath URIs to NIF index based URIs (e.g. RFC 5147 with 
char=x,y). 
+        * An example of the usage of this property can be found here: 
+        * http://www.w3.org/TR/its20/#conversion-to-nif
+        * <p>
+        * Example
+        * <p>
+        * # "Dublin"
+        * <p>
+        * 
&lt;http://example.com/myitsservice?informat=html&intype=url&input=http://example.com/doc.html&char=11,17&gt;
+        * <p>
+        * nif:wasConvertedFrom
+        * <p>
+        * 
&lt;http://example.com/myitsservice?informat=html&intype=url&input=http://example.com/doc.html&xpath=/html/body[1]/h2[1]/span[1]/text()[1]&gt;.
+        */
+       wasConvertedFrom,
+       //Datatype properties
+       /**
+        * For each string you can include a snippet (e.g. 10-40 characters of 
text), 
+        * that occurs immediately after the subject string.
+        */
+       after,
+       /**
+        * The string, which the URI is representing as an RDF Literal. Some 
use 
+        * cases require this property, as it is necessary for certain sparql 
queries. 
+        */
+       anchorOf,
+       /**
+        * For each string you can include a snippet (e.g. 10-40 characters of 
text), 
+        * that occurs immediately before the subject string.
+        */
+       before,
+       /**
+        * The begin index of a character range as defined in 
+        * http://tools.ietf.org/html/rfc5147#section-2.2.1 and 
+        * http://tools.ietf.org/html/rfc5147#section-2.2.2, measured as the 
gap 
+        * between two characters, starting to count from 0 (the position 
before 
+        * the first character of a text).
+        * <p>
+        * Example: Index "2" is the position between "Mr" and "." in "Mr. 
Sandman".
+        * <p>
+        * Note: RFC 5147 is re-used for the definition of character ranges. 
RFC 5147 
+        * is assuming a text/plain MIME type. NIF builds upon Unicode and is 
content 
+        * agnostic.
+        * <p>
+        * Requirement (1): This property has the same value as the "Character 
position" 
+        * of RFC 5147 and it MUST therefore be castable to 
xsd:nonNegativeInteger, 
+        * i.e. it MUST not have negative values.
+        * <p>
+        * Requirement (2): The index of the subject string MUST be calculated 
+        * relative to the nif:referenceContext of the subject. If available, 
this 
+        * is the rdf:Literal of the nif:isString property.
+        */
+       beginIndex,
+       /**
+        * The confidence is relative to the tool and can be between 0.0 and 
1.0, 
+        * it is for nif:oliaLink and therefore also for nif:oliaCategory.
+        */
+       oliaConf,
+       /**
+        * The end index of a character range as defined in 
+        * http://tools.ietf.org/html/rfc5147#section-2.2.1 and 
+        * http://tools.ietf.org/html/rfc5147#section-2.2.2, measured as the 
gap 
+        * between two characters, starting to count from 0 (the position 
before 
+        * the first character of a text).
+        * <p>
+        * Example: Index "2" is the position between "Mr" and "." in "Mr. 
Sandman".
+        * <p>
+        * Note: RFC 5147 is re-used for the definition of character ranges. 
RFC 5147 
+        * is assuming a text/plain MIME type. NIF builds upon Unicode and is 
content 
+        * agnostic.
+        * <p>
+        * Requirement (1): This property has the same value as the "Character 
position" 
+        * of RFC 5147 and it must therefore be an xsd:nonNegativeInteger .
+        * <p>
+        * Requirement (2): The index of the subject string MUST be calculated 
+        * relative to the nif:referenceContext of the subject. If available, 
this 
+        * is the rdf:Literal of the nif:isString property.
+        */
+       endIndex,
+       /**
+        * The first few chars of the nif:anchorOf. Typically used if the 
nif:anchorOf
+        * is too long for inclusion as RDF literal.
+        */
+       head,
+       /**
+        * The reference text as rdf:Literal for this nif:Context resource.
+        * NIF requires that the reference text (i.e. the context) is always 
+        * included in the RDF as an rdf:Literal.
+        * <p>
+        * Note, that the isString property is *the* place to keep the string 
itself 
+        * in RDF.
+        * <p>
+        * All other nif:Strings and nif:URISchemes relate to the text of this 
+        * property to calculate character position and indices.
+        */
+       isString,
+       /**
+        * The lemma(s) of the nif:String.
+        */
+       lemma,
+       /**
+        * see <a 
href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+        * Towards Web-Scale Collaborative Knowledge Extraction</a> – page 21.
+        */
+       literalAnnotation,
+       /**
+        * To include the pos tag as it comes out of the NLP tool as RDF 
Literal. 
+        * This property is discouraged to use alone, please use oliaLink and 
+        * oliaCategory. We included it, because some people might still want 
it 
+        * and will even create their own property, if the string variant is 
missing  
+        */
+       posTag,
+       /**
+        * Between -1 negative and 1 positive
+        */
+       sentimentValue,
+       /**
+        * The stem(s) of the nif:String.
+        */
+       stem,
+       //Annotation properties
+       /**
+        * A simple annotation for machine learning purposes. The object can be 
+        * anything, e.g. the literal "A. PRESS: Reportage" from Brown or any 
URI. 
+        */
+       category,
+       /**
+        * see <a 
href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+        * Towards Web-Scale Collaborative Knowledge Extraction</a> – page 12.
+        */
+       classAnnotation,
+       /**
+        * This property marks the most specific class from itsrdf:taClassRef. 
+        * The rule is: from the set S of itsrdf:taClassRef attached to this 
resource 
+        * taMsClassRef points to the one that does not have any subclasses in the 
set 
+        * S except itself. So if taClassRef is owl:Thing, dbo:Agent, 
dbo:Person, 
+        * dbp:Actor taMsClassRef is dbo:Actor 
+        */
+       taMsClassRef,
+       /**
+        * This property links a string URI to classes of the OLiA Reference 
model. 
+        * It provides a direct link for querying, thus it is a redundant 
optimization.
+        * <p>
+        * Values are expected to be member of {@link Pos}
+        */
+       oliaCategory,
+       ;
+    /**
+     * Namespace of the NIF 2.0 core ontology. The URI of every constant of
+     * this enumeration is this prefix followed by the name of the constant.
+     */
+    public static final String NAMESPACE = 
+        "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#";
+
+    /**
+     * The URI of this constant. Assigned exactly once by the constructor and
+     * therefore declared final, so it can not be replaced afterwards.
+     */
+    final UriRef uri;
+    
+    /**
+     * Creates the constant and builds its URI by appending {@link #name()}
+     * to {@link #NAMESPACE}.
+     */
+    private Nif20() {
+        uri = new UriRef(NAMESPACE + name());
+    }
+    
+    /**
+     * Getter for the local name of this constant.
+     *
+     * @return the value of {@link #name()}, i.e. the part of the URI that
+     *         the constructor appends to {@link #NAMESPACE}
+     */
+    public String getLocalName() {
+        final String localName = this.name();
+        return localName;
+    }
+    
+    /**
+     * Getter for the URI of this constant.
+     *
+     * @return the {@link UriRef} created by the constructor
+     */
+    public UriRef getUri() {
+        return this.uri;
+    }
+    
+    /**
+     * String representation of this constant.
+     *
+     * @return the unicode string of the URI of this constant
+     */
+    @Override
+    public String toString() {
+        final UriRef ref = this.uri;
+        return ref.getUnicodeString();
+    }
+
+}

svn commit: r1633587 - in /stanbol/trunk: enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/ enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/

Reply via email to