Author: tommaso
Date: Fri Sep 17 10:01:52 2010
New Revision: 998047
URL: http://svn.apache.org/viewvc?rev=998047&view=rev
Log:
[CLEREZZA-193] - fixed metadata generation by refactoring media-type-based content
extraction; fixed feature value processing in utils
Added:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/MediaTypeTextExtractor.java
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/PlainTextExtractor.java
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/UnsupportedMediaTypeException.java
Modified:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGenerator.java
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/test/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGeneratorTest.java
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/AEProvider.java
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/UIMAUtils.java
Modified:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGenerator.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGenerator.java?rev=998047&r1=998046&r2=998047&view=diff
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGenerator.java
(original)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGenerator.java
Fri Sep 17 10:01:52 2010
@@ -4,6 +4,9 @@ import org.apache.clerezza.rdf.metadata.
import org.apache.clerezza.rdf.ontologies.DC;
import org.apache.clerezza.rdf.ontologies.DCTERMS;
import org.apache.clerezza.rdf.utils.GraphNode;
+import
org.apache.clerezza.uima.metadatagenerator.mediatype.MediaTypeTextExtractor;
+import org.apache.clerezza.uima.metadatagenerator.mediatype.PlainTextExtractor;
+import
org.apache.clerezza.uima.metadatagenerator.mediatype.UnsupportedMediaTypeException;
import org.apache.clerezza.uima.utils.ExternalServicesFacade;
import org.apache.clerezza.uima.utils.UIMAUtils;
import org.apache.felix.scr.annotations.Component;
@@ -12,12 +15,16 @@ import org.apache.felix.scr.annotations.
import org.apache.uima.UIMAException;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.jcas.tcas.Annotation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import javax.ws.rs.core.MediaType;
import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
/**
- * An implementation of <code>MetaDataGenerator</code> generates meta data
about specified data
+ * An implementation of {@link MetaDataGenerator} generates meta data about
specified data
* depending on its media type using Apache UIMA.
*/
@Component(metatype = true)
@@ -27,10 +34,27 @@ import java.util.List;
})
public class UIMABaseMetadataGenerator implements MetaDataGenerator {
- private ExternalServicesFacade facade = new ExternalServicesFacade();
+ private final static Logger log =
LoggerFactory.getLogger(UIMABaseMetadataGenerator.class);
+
+ private ExternalServicesFacade facade;
+
+ private Set<MediaTypeTextExtractor> textExtractors;
+
+ public UIMABaseMetadataGenerator() {
+ this.facade = new ExternalServicesFacade();
+ this.textExtractors = new TreeSet<MediaTypeTextExtractor>();
+ }
+
+ public UIMABaseMetadataGenerator(ExternalServicesFacade facade) {
+ this.facade = facade;
+ this.textExtractors = new TreeSet<MediaTypeTextExtractor>();
+ }
@Override
public void generate(GraphNode node, byte[] data, MediaType mediaType) {
+ if (textExtractors.isEmpty()) {
+ initializeExtractors();
+ }
try {
String text = getTextToAnalyze(data, mediaType);
@@ -46,18 +70,28 @@ public class UIMABaseMetadataGenerator i
// add alchemyAPI's annotations' nodes
addAlchemyAPIEntities(node, text);
+ log.info(new StringBuilder(node.toString()).append(" graph node
enriched").toString());
} catch (Throwable e) {
- // do nothing
+ log.error(new StringBuilder("Unable to extract metadata due to
").append(e.toString()).toString());
}
}
+ /* initialize text extractors sorted set */
+ private void initializeExtractors() {
+ this.textExtractors.add(new PlainTextExtractor());
+ }
+
private String getTextToAnalyze(byte[] data, MediaType mediaType) throws
UnsupportedMediaTypeException {
+ // since extractors are sorted, the first I found supporting this
mediaType is good
String text = null;
- if (MediaType.TEXT_PLAIN.equals(mediaType)) {
- text = new String(data);
+ for (MediaTypeTextExtractor textExtractor : this.textExtractors) {
+ if (textExtractor.supports(mediaType)) {
+ text = textExtractor.extract(data);
+ break;
+ }
}
if (text == null) {
- throw new UnsupportedMediaTypeException(mediaType.getType());
+ throw new UnsupportedMediaTypeException(mediaType);
}
return text;
}
@@ -65,13 +99,13 @@ public class UIMABaseMetadataGenerator i
private void addCategory(GraphNode node, String data) throws UIMAException {
// get category to bind it to the node
- String category = facade.getCategory(data);
+ String category = this.facade.getCategory(data);
node.addPropertyValue(DC.subject, category);
}
private void addLanguage(GraphNode node, String data) throws UIMAException {
// get language to bind it to the node
- String language = facade.getLanguage(data);
+ String language = this.facade.getLanguage(data);
node.addPropertyValue(DCTERMS.language, language);
}
@@ -85,14 +119,9 @@ public class UIMABaseMetadataGenerator i
private void addAlchemyAPIEntities(GraphNode existingNode, String data)
throws UIMAException {
// analyze document text and get the corresponding AlchemyAPI Tags
- List<FeatureStructure> alchemyAPIEntities = facade.getAlchemyAPITags(data);
+ List<FeatureStructure> alchemyAPIEntities =
this.facade.getAlchemyAPITags(data);
// convert entities to nodes inside the current graph
UIMAUtils.enhanceNode(existingNode, alchemyAPIEntities);
}
- private class UnsupportedMediaTypeException extends Throwable {
- private UnsupportedMediaTypeException(String s) {
- super(s);
- }
- }
}
Added:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/MediaTypeTextExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/MediaTypeTextExtractor.java?rev=998047&view=auto
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/MediaTypeTextExtractor.java
(added)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/MediaTypeTextExtractor.java
Fri Sep 17 10:01:52 2010
@@ -0,0 +1,14 @@
+package org.apache.clerezza.uima.metadatagenerator.mediatype;
+
+import javax.ws.rs.core.MediaType;
+
+/**
+ * A MediaTypeTextExtractor should extract text from a (list of) specified
{@link javax.ws.rs.core.MediaType}
+ */
+public interface MediaTypeTextExtractor {
+
+ public boolean supports(MediaType mediaType);
+
+ public String extract(byte[] bytes) throws UnsupportedMediaTypeException;
+
+}
Added:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/PlainTextExtractor.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/PlainTextExtractor.java?rev=998047&view=auto
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/PlainTextExtractor.java
(added)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/PlainTextExtractor.java
Fri Sep 17 10:01:52 2010
@@ -0,0 +1,19 @@
+package org.apache.clerezza.uima.metadatagenerator.mediatype;
+
+import javax.ws.rs.core.MediaType;
+
+/**
+ * Base implementation of {@link javax.ws.rs.core.MediaType}
+ */
+public class PlainTextExtractor implements MediaTypeTextExtractor {
+ @Override
+ public boolean supports(MediaType mediaType) {
+ return mediaType != null &&
mediaType.getType().equals(MediaType.TEXT_PLAIN_TYPE.getType()) &&
+
mediaType.getSubtype().equals(MediaType.TEXT_PLAIN_TYPE.getSubtype());
+ }
+
+ @Override
+ public String extract(byte[] bytes) throws UnsupportedMediaTypeException {
+ return new String(bytes);
+ }
+}
Added:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/UnsupportedMediaTypeException.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/UnsupportedMediaTypeException.java?rev=998047&view=auto
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/UnsupportedMediaTypeException.java
(added)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/main/java/org/apache/clerezza/uima/metadatagenerator/mediatype/UnsupportedMediaTypeException.java
Fri Sep 17 10:01:52 2010
@@ -0,0 +1,14 @@
+package org.apache.clerezza.uima.metadatagenerator.mediatype;
+
+import javax.ws.rs.core.MediaType;
+
+/**
+ * When a {@link javax.ws.rs.core.MediaType} is not supported this
exception is thrown
+ */
+public class UnsupportedMediaTypeException extends Exception {
+ private static final String UNSUPPORTED = " is not supported";
+
+ public UnsupportedMediaTypeException(MediaType mediaType) {
+ super(new
StringBuilder(mediaType.getType()).append(UNSUPPORTED).toString());
+ }
+}
Modified:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/test/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGeneratorTest.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/test/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGeneratorTest.java?rev=998047&r1=998046&r2=998047&view=diff
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/test/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGeneratorTest.java
(original)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.metadata-generator/src/test/java/org/apache/clerezza/uima/metadatagenerator/UIMABaseMetadataGeneratorTest.java
Fri Sep 17 10:01:52 2010
@@ -4,17 +4,21 @@ import org.apache.clerezza.rdf.core.MGra
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.SimpleMGraph;
import org.apache.clerezza.rdf.utils.GraphNode;
+import org.apache.clerezza.uima.utils.ExternalServicesFacade;
import org.junit.Test;
import javax.ws.rs.core.MediaType;
+import java.util.HashMap;
+import java.util.Map;
import static org.junit.Assert.fail;
+
/**
* Testcase for {@link UIMABaseMetadataGenerator}
- *
*/
public class UIMABaseMetadataGeneratorTest {
-
+ private static final String TEXT_TO_ANALYZE = "Italy, the defending
champions and four-time World Cup winners, suffer a shock World Cup defeat to
Slovakia, who win a remarkable game 3-2 to book their place in the last 16";
+
@Test
public void testConstructor() {
try {
@@ -24,20 +28,40 @@ public class UIMABaseMetadataGeneratorTe
fail(e.getLocalizedMessage());
}
}
-
+
@Test
public void testGenerateMethodWithUnsupportedMediaType() {
try {
UIMABaseMetadataGenerator baseMetadataGenerator = new
UIMABaseMetadataGenerator();
- String textToAnalyze = "Italy, the defending champions and four-time
World Cup winners, suffer a shock World Cup defeat to Slovakia, who win a
remarkable game 3-2 to book their place in the last 16";
+ String textToAnalyze = TEXT_TO_ANALYZE;
MGraph mGraph = new SimpleMGraph();
- GraphNode node = new GraphNode(new UriRef("test"), mGraph.getGraph());
+ GraphNode node = new GraphNode(new UriRef("test"), mGraph);
MediaType wrongMediaType = MediaType.valueOf("multipart/form-data;
boundary=AaB03x");
baseMetadataGenerator.generate(node, textToAnalyze.getBytes(),
wrongMediaType);
} catch (Exception e) {
fail(e.getLocalizedMessage());
}
-
+
+ }
+
+ @Test
+ public void testGenerateMethodWithsupportedMediaType() {
+ try {
+
+ ExternalServicesFacade externalServicesFacade = new
ExternalServicesFacade();
+ Map<String, Object> parameterSettings = new HashMap<String, Object>();
+ parameterSettings.put("apikey",
"04490000a72fe7ec5cb3497f14e77f338c86f2fe");
+ parameterSettings.put("licenseID", "g6h9zamsdtwhb93nc247ecrs");
+ externalServicesFacade.setParameterSetting(parameterSettings);
+ UIMABaseMetadataGenerator baseMetadataGenerator = new
UIMABaseMetadataGenerator(externalServicesFacade);
+ String textToAnalyze = TEXT_TO_ANALYZE;
+ MGraph mGraph = new SimpleMGraph();
+ GraphNode node = new GraphNode(new UriRef("test"), mGraph);
+ baseMetadataGenerator.generate(node, textToAnalyze.getBytes(),
MediaType.TEXT_PLAIN_TYPE);
+ } catch (Exception e) {
+ fail(e.getLocalizedMessage());
+ }
+
}
}
Modified:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/AEProvider.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/AEProvider.java?rev=998047&r1=998046&r2=998047&view=diff
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/AEProvider.java
(original)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/AEProvider.java
Fri Sep 17 10:01:52 2010
@@ -1,17 +1,14 @@
package org.apache.clerezza.uima.utils;
-import java.net.URL;
-import java.util.Map;
-
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceSpecifier;
-import org.apache.uima.resource.metadata.ConfigurationParameterSettings;
-import
org.apache.uima.resource.metadata.impl.ConfigurationParameterSettings_impl;
import org.apache.uima.util.XMLInputSource;
-import org.apache.uima.util.XMLParser;
+
+import java.net.URL;
+import java.util.Map;
/**
* provide the AnalysisEngine using the default descriptor or using a custom
descriptor (absolute)
@@ -78,7 +75,8 @@ public class AEProvider {
// eventually add/override descriptor's configuration parameters
AnalysisEngineDescription desc =
UIMAFramework.getXMLParser().parseAnalysisEngineDescription(in);
for (String parameter : parameterSettings.keySet()) {
-
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(parameter,parameterSettings.get(parameter));
+ if
(desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().getParameterValue(parameter)!=null)
+
desc.getAnalysisEngineMetaData().getConfigurationParameterSettings().setParameterValue(parameter,parameterSettings.get(parameter));
}
// create AE here
Modified:
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/UIMAUtils.java
URL:
http://svn.apache.org/viewvc/incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/UIMAUtils.java?rev=998047&r1=998046&r2=998047&view=diff
==============================================================================
---
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/UIMAUtils.java
(original)
+++
incubator/clerezza/trunk/org.apache.clerezza.parent/org.apache.clerezza.uima/org.apache.clerezza.uima.utils/src/main/java/org/apache/clerezza/uima/utils/UIMAUtils.java
Fri Sep 17 10:01:52 2010
@@ -7,9 +7,12 @@ import org.apache.clerezza.uima.utils.ex
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.Type;
import org.apache.uima.cas.text.AnnotationIndex;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
import java.util.ArrayList;
import java.util.List;
@@ -20,6 +23,8 @@ import java.util.concurrent.locks.Lock;
*/
public class UIMAUtils {
+ private final static Logger log = LoggerFactory.getLogger(UIMAUtils.class);
+
public static List<FeatureStructure> getAllFSofType(int type, JCas cas)
throws FeatureStructureNotFoundException {
List<FeatureStructure> featureStructures = new
ArrayList<FeatureStructure>();
@@ -65,6 +70,7 @@ public class UIMAUtils {
for (FeatureStructure uimaObject : uimaObjects) {
// create a new node for the current Annotation
GraphNode annotationNode = new GraphNode(ENTITY.Annotation,
existingNode.getGraph());
+ log.info(new StringBuilder("Node created for Type
").append(uimaObject.getType().toString()).toString());
// set Annotation specific properties for the node
if (uimaObject instanceof Annotation) {
@@ -77,15 +83,30 @@ public class UIMAUtils {
annotationNode.addPropertyValue(ENTITY.uimaType,
uimaObject.getType().getName());
/* inspect features of the annotation */
- for (Feature feature : uimaObject.getType().getFeatures()) {
+ Type type = uimaObject.getType();
+ for (Feature feature : type.getFeatures()) {
// create a new feature node
GraphNode featureNode = new GraphNode(ENTITY.Feature,
existingNode.getGraph());
+ log.info(new StringBuilder("Node created for Feature
").append(feature.getName()).toString());
+
// set feature name and value if not null
featureNode.addPropertyValue(ENTITY.featureName, feature.getName());
- FeatureStructure featureValue = uimaObject.getFeatureValue(feature);
- if (featureValue != null)
+
+ String featureValue = null;
+ try {
+ featureValue = uimaObject.getFeatureValueAsString(feature);
+ }
+ catch (Exception e) {
+ // do nothing at the moment
+ log.warn(new StringBuilder("Unable to create feature value -
").append(e.toString()).toString());
+ }
+
+ if (featureValue != null) {
featureNode.addPropertyValue(ENTITY.featureValue, featureValue);
+ log.info(new StringBuilder("Added feature
").append(feature.getName()).append(" with value ")
+ .append(featureValue.toString()).toString());
+ }
// add feature to the annotation node
annotationNode.addProperty(ENTITY.hasFeature, featureNode.getNode());