Author: wkasper
Date: Fri Feb 17 11:42:08 2012
New Revision: 1245410
URL: http://svn.apache.org/viewvc?rev=1245410&view=rev
Log:
STANBOL-478: Added option to include plainText directly in the metadata graph.
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
Modified: incubator/stanbol/trunk/enhancer/engines/metaxa/README.md
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/README.md?rev=1245410&r1=1245409&r2=1245410&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/metaxa/README.md (original)
+++ incubator/stanbol/trunk/enhancer/engines/metaxa/README.md Fri Feb 17
11:42:08 2012
@@ -147,6 +147,10 @@ following set of document formats:
The plain text content of a document in the content item is stored as a Blob. To
retrieve it, use
String text =
ContentItemHelper.getText(ContentItemHelper.getBlob(contentItem,
java.util.Collections.singleton("text/plain")));
+
+An alternative is to have the extracted plain text content included directly in
the metadata by setting the property
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.includeText</code></pre>
to true. The extracted text is then available as the value of the property
+
+
http://www.semanticdesktop.org/ontologies/2007/01/19/nie#plainTextContent
### Vocabularies
@@ -273,6 +277,14 @@ The alternative configuration files then
*
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.htmlextractors</code></pre>
+Other configuration options:
+
+*
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.includeText</code></pre>
provides an option to include extracted plain text directly in the metadata
as the value of the property
+
+
http://www.semanticdesktop.org/ontologies/2007/01/19/nie#plainTextContent
+
+*
<pre><code>org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes</code></pre>
allows specifying a set of MIME types that Metaxa should ignore. By default,
plain text documents are ignored.
+
## Usage
Assuming that the Stanbol endpoint with the full launcher is running at
@@ -283,7 +295,7 @@ and the engine is activated, from the co
* stateless interface
- curl -i -X POST -H "Content-Type:text/html" -T testpage.html
http://localhost:8080/engines
+ curl -i -X POST -H "Content-Type:text/html" -T testpage.html
http://localhost:8080/enhancer
* stateful interface
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java?rev=1245410&r1=1245409&r2=1245410&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/java/org/apache/stanbol/enhancer/engines/metaxa/MetaxaEngine.java
Fri Feb 17 11:42:08 2012
@@ -103,7 +103,7 @@ public class MetaxaEngine
public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
/**
- * name of a file defining the available docuemnt extractors for Metaxa.
By defualt, the builtin file 'extractionregistry.xml' is used.
+ * name of a file defining the available document extractors for Metaxa.
By default, the builtin file 'extractionregistry.xml' is used.
*/
@Property(value=MetaxaEngine.DEFAULT_EXTRACTION_REGISTRY)
public static final String GLOBAL_EXTRACTOR_REGISTRY =
"org.apache.stanbol.enhancer.engines.metaxa.extractionregistry";
@@ -116,6 +116,12 @@ public class MetaxaEngine
@Property(value={"text/plain"},cardinality=1000)
public static final String IGNORE_MIME_TYPES =
"org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes";
+
+ /**
+ * a boolean option controlling whether extracted text should be included in the
metadata as the value of the NIE.plainTextContent property
+ */
+ @Property(boolValue=false)
+ public static final String INCLUDE_TEXT_IN_METADATA =
"org.apache.stanbol.enhancer.engines.metaxa.includeText";
private MetaxaCore extractor;
BundleContext bundleContext;
@@ -124,6 +130,7 @@ public class MetaxaEngine
public static final String DEFAULT_HTML_EXTRACTOR_REGISTRY =
"htmlextractors.xml";
private Set<String> ignoredMimeTypes;
+ private boolean includeText = false;
/**
* The activate method.
@@ -168,6 +175,11 @@ public class MetaxaEngine
} else {
ignoredMimeTypes = Collections.singleton("text/plain");
}
+ value = ce.getProperties().get(INCLUDE_TEXT_IN_METADATA);
+ if (value instanceof Boolean) {
+ includeText = ((Boolean)value).booleanValue();
+ log.info("Include Text set to: {}",value);
+ }
}
/**
@@ -234,6 +246,12 @@ public class MetaxaEngine
if(oneStmt.getSubject().equals(docId) &&
oneStmt.getPredicate().equals(NIE_PLAINTEXT_PROPERTY)){
out.write(oneStmt.getObject().toString());
+ if (includeText) {
+ NonLiteral subject = (NonLiteral)
asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
+ UriRef predicate = (UriRef)
asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
+ Resource object =
asClerezzaResource(oneStmt.getObject(), blankNodeMap);
+ g.add(new TripleImpl(subject, predicate, object));
+ }
} else { //add metadata to the metadata of the contentItem
NonLiteral subject = (NonLiteral)
asClerezzaResource(oneStmt.getSubject(), blankNodeMap);
UriRef predicate = (UriRef)
asClerezzaResource(oneStmt.getPredicate(), blankNodeMap);
Modified:
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1245410&r1=1245409&r2=1245410&view=diff
==============================================================================
---
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
(original)
+++
incubator/stanbol/trunk/enhancer/engines/metaxa/src/main/resources/OSGI-INF/metatype/metatype.properties
Fri Feb 17 11:42:08 2012
@@ -41,4 +41,8 @@ resource on the bundle classpath that sp
org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.name=Ignored Mime
Types
org.apache.stanbol.enhancer.engines.metaxa.ignoreMimeTypes.description=This
allows to \
-provide a list of MIME TYPES that are not processed by this engine.
\ No newline at end of file
+provide a list of MIME TYPES that are not processed by this engine.
+
+org.apache.stanbol.enhancer.engines.metaxa.includeText.name=Include extracted
plain text
+org.apache.stanbol.enhancer.engines.metaxa.includeText.description=An option
to specify whether \
+extracted plain text should be included in the metadata.