Author: wkasper
Date: Fri Feb 17 11:42:08 2012
New Revision: 1245410

URL: http://svn.apache.org/viewvc?rev=1245410&view=rev
Log:
STANBOL-478: Added option to include plainText directly in the metadata graph.

Modified:
    incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
    
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties

Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/README.md?rev=1245410&r1=1245409&r2=1245410&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/README.md (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/README.md Fri Feb 17 
11:42:08 2012
@@ -147,6 +147,10 @@ following set of document formats:
 The plain text content of a document in the content is stored in as a Blob. To 
retrieve it, use
 
     String text = 
ContentItemHelper.getText(ContentItemHelper.getBlob(contentItem, 
java.util.Collections.singleton("text/plain")));
+    
+An alternative is to have the extracted plain text content included directly in 
the metadata by setting the property 
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.includeText</code></pre> 
to true. The extracted text is then available as the value of the property
+
+               
http://www.semanticdesktop.org/ontologies/2007/01/19/nie#plainTextContent
 
 ### Vocabularies
 
@@ -273,6 +277,14 @@ The alternative configuration files then
 
 * 
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.htmlextractors</pre></code>
 
+Other configuration options:
+
+* 
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.includeText</code></pre> 
provides an option to include the extracted plain text directly in the metadata 
as the value of the property
+
+               
http://www.semanticdesktop.org/ontologies/2007/01/19/nie#plainTextContent
+               
+* 
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes</code></pre>
 specifies a set of MIME types that Metaxa should ignore. By default, 
plain text documents are ignored.
+
 ## Usage
 
 Assuming that the Stanbol endpoint with the full launcher is running at
@@ -283,7 +295,7 @@ and the engine is activated, from the co
 
 * stateless interface
 
-    curl -i -X POST -H "Content-Type:text/html" -T testpage.html 
http://localhost:8080/engines
+    curl -i -X POST -H "Content-Type:text/html" -T testpage.html 
http://localhost:8080/enhancer
 
 * stateful interface
 

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java?rev=1245410&r1=1245409&r2=1245410&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
 Fri Feb 17 11:42:08 2012
@@ -103,7 +103,7 @@ public class MetaxaEngine 
     public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
 
     /**
-     * name of a file defining the available docuemnt extractors for Metaxa. 
By defualt, the builtin file 'extractionregistry.xml' is used.
+     * name of a file defining the available document extractors for Metaxa. 
By default, the builtin file 'extractionregistry.xml' is used.
      */
     @Property(value=MetaxaEngine.DEFAULT_EXTRACTION_REGISTRY)
     public static final String GLOBAL_EXTRACTOR_REGISTRY = 
"org.apache.stanbol.enhancer.engines.metaxa.extractionregistry";
@@ -116,6 +116,12 @@ public class MetaxaEngine 
 
     @Property(value={"text/plain"},cardinality=1000)
     public static final String IGNORE_MIME_TYPES = 
"org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes";
+
+    /**
+     * Boolean option that controls whether the extracted text is included in the 
metadata as the value of the NIE.plainTextContent property
+     */
+    @Property(boolValue=false)
+    public static final String INCLUDE_TEXT_IN_METADATA = 
"org.apache.stanbol.enhancer.engines.metaxa.includeText";
     private MetaxaCore extractor;
     
     BundleContext bundleContext;
@@ -124,6 +130,7 @@ public class MetaxaEngine 
     public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY = 
"htmlextractors.xml";
     
     private Set<String> ignoredMimeTypes;
+    private boolean includeText = false;
 
     /**
      * The activate method.
@@ -168,6 +175,11 @@ public class MetaxaEngine 
         } else {
             ignoredMimeTypes = Collections.singleton("text/plain");
         }
+        value = ce.getProperties().get(INCLUDE_TEXT_IN_METADATA);
+        if (value instanceof Boolean) {
+          includeText = ((Boolean)value).booleanValue();
+          log.info("Include Text set to: {}",value);
+        }
     }
 
     /**
@@ -234,6 +246,12 @@ public class MetaxaEngine 
                     if(oneStmt.getSubject().equals(docId) && 
                             
oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
                         out.write(oneStmt.getObject().toString());
+                        if (includeText) {
+                          NonLiteral subject = (NonLiteral) 
asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
+                          UriRef predicate = (UriRef) 
asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
+                          Resource object = 
asClerezzaResource(oneStmt.getObject(), blankNodeMap);
+                          g.add(new TripleImpl(subject, predicate, object));
+                        }
                     } else { //add metadata to the metadata of the contentItem
                         NonLiteral subject = (NonLiteral) 
asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
                         UriRef predicate = (UriRef) 
asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);

Modified: 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1245410&r1=1245409&r2=1245410&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
 (original)
+++ 
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
 Fri Feb 17 11:42:08 2012
@@ -41,4 +41,8 @@ resource on the bundle classpath that sp
 
 org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.name=Ignored Mime 
Types
 org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.description=This 
allows to \
-provide a list of MIME TYPES that are not processed by this engine.
\ No newline at end of file
+provide a list of MIME TYPES that are not processed by this engine.
+
+org.apache.stanbol.enhancer.engines.metaxa.includeText.name=Include extracted 
plain text
+org.apache.stanbol.enhancer.engines.metaxa.includeText.description=An option 
to specify whether \
+extracted plain text should be included in the metadata.


Reply via email to