Author: rwesten
Date: Thu May 31 04:45:41 2012
New Revision: 1344563
URL: http://svn.apache.org/viewvc?rev=1344563&view=rev
Log:
STANBOL-583: Applied the patch provided by Alessio Bosca on 2012-05-30
* had to manually merge the example texts in the NER test because of encoding
issues.
* unit tests do complete after applying the patch!
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngineTest.java
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java?rev=1344563&r1=1344562&r2=1344563&view=diff
==============================================================================
---
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
(original)
+++
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
Thu May 31 04:45:41 2012
@@ -250,45 +250,45 @@ public class ClassificationClientHTTP {
// An even better variant would be to write a UnitTest for that!!
// This would be recommended if the called service is still in beta
// and may change at any time
-// public static void main(String[] args) throws Exception {
-// String lang = "fr";
-// String text = "Brigitte Bardot, née le 28 septembre " +
-// "1934 à Paris, est une actrice de cinéma et chanteuse
française.";
-//
-// //For request testing
-// //Writer request = new StringWriter();
-//
-// //For response testing
-// HttpURLConnection con = Utils.createPostRequest(
-// new
URL("http://linguagrid.org/LSGrid/ws/dbpedia-classification"),
-// Collections.singletonMap("Content-Type", CONTENT_TYPE));
-// Writer request = new OutputStreamWriter(con.getOutputStream(),UTF8);
-//
-// //"stream" the request content directly to the buffered writer
-// BufferedWriter writer = new BufferedWriter(request);
-//
-// writer.write(SOAP_PREFIX);
-// writer.write("<clas:classify>");
-// writer.write("<clas:user>wiki</clas:user>");//TODO: should the user
be configurable?
-// writer.write("<clas:model>");
-// writer.write(lang);
-// writer.write("</clas:model>");
-// writer.write("<clas:text>");
-// StringEscapeUtils.escapeXml(writer, text); //write the escaped text
directly to the request
-// writer.write("</clas:text>");
-// writer.write("</clas:classify>");
-// writer.write(SOAP_SUFFIX);
-// writer.close();
-//
-// //log the Request (if request testing)
-// //log.info("Request \n{}",request.toString());
-//
-// //for response testing we need to call the service
-// //Call the service
-// long start = System.currentTimeMillis();
-// InputStream stream = con.getInputStream();
-// log.info("Request to took {}ms",System.currentTimeMillis()-start);
-// log.info("Response:\n{}",IOUtils.toString(stream));
-// stream.close();
-// }
+ public static void main(String[] args) throws Exception {
+ String lang = "fr";
+ String text = "Brigitte Bardot, née le 28 septembre " +
+ "1934 à Paris, est une actrice de cinéma et chanteuse
française.";
+
+ //For request testing
+ //Writer request = new StringWriter();
+
+ //For response testing
+ HttpURLConnection con = Utils.createPostRequest(
+ new URL("http://linguagrid.org/LSGrid/ws/dbpedia-classification"),
+ Collections.singletonMap("Content-Type", CONTENT_TYPE));
+ Writer request = new OutputStreamWriter(con.getOutputStream(),UTF8);
+
+ //"stream" the request content directly to the buffered writer
+ BufferedWriter writer = new BufferedWriter(request);
+
+ writer.write(SOAP_PREFIX);
+ writer.write("<clas:classify>");
+ writer.write("<clas:user>wiki</clas:user>");//TODO: should the user be
configurable?
+ writer.write("<clas:model>");
+ writer.write(lang);
+ writer.write("</clas:model>");
+ writer.write("<clas:text>");
+ StringEscapeUtils.escapeXml(writer, text); //write the escaped text
directly to the request
+ writer.write("</clas:text>");
+ writer.write("</clas:classify>");
+ writer.write(SOAP_SUFFIX);
+ writer.close();
+
+ //log the Request (if request testing)
+ //log.info("Request \n{}",request.toString());
+
+ //for response testing we need to call the service
+ //Call the service
+ long start = System.currentTimeMillis();
+ InputStream stream = con.getInputStream();
+ log.info("Request to took {}ms",System.currentTimeMillis()-start);
+ log.info("Response:\n{}",IOUtils.toString(stream));
+ stream.close();
+ }
}
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java?rev=1344563&r1=1344562&r2=1344563&view=diff
==============================================================================
---
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
(original)
+++
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
Thu May 31 04:45:41 2012
@@ -72,7 +72,7 @@ public class LemmatizerClientHTTP {
BufferedWriter writer = new BufferedWriter(new
OutputStreamWriter(con.getOutputStream(),UTF8));
//write the SOAP envelope, header and start the body
writer.write(SOAP_REQUEST_PREFIX);
- //wrtie the data (language and text)
+ //write the data (language and text)
writer.write("<mor:inputText lang=\"");
writer.write(lang);
writer.write("\" text=\"");
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java?rev=1344563&r1=1344562&r2=1344563&view=diff
==============================================================================
---
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java
(original)
+++
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java
Thu May 31 04:45:41 2012
@@ -75,12 +75,12 @@ public class CeliNamedEntityExtractionEn
private static Map<String, UriRef> entityTypes = new HashMap<String,
UriRef>();
static {
entityTypes.put("pers", OntologicalClasses.DBPEDIA_PERSON);
+ entityTypes.put("PER", OntologicalClasses.DBPEDIA_PERSON);
entityTypes.put("loc", OntologicalClasses.DBPEDIA_PLACE);
+ entityTypes.put("GPE", OntologicalClasses.DBPEDIA_PLACE);
entityTypes.put("org", OntologicalClasses.DBPEDIA_ORGANISATION);
entityTypes.put("time", OntologicalClasses.SKOS_CONCEPT);
- entityTypes.put("prod", OntologicalClasses.SKOS_CONCEPT);
- entityTypes.put("amount", OntologicalClasses.SKOS_CONCEPT);
}
/**
* The supported languages (configured via the {@link
#SUPPORTED_LANGUAGES}
@@ -114,7 +114,7 @@ public class CeliNamedEntityExtractionEn
@Property(value =
"http://linguagrid.org/LSGrid/ws/com.celi-france.linguagrid.namedentityrecognition.v0u0.demo")
public static final String SERVICE_URL =
"org.apache.stanbol.enhancer.engines.celi.ner.url";
- @Property(value = "fr",cardinality=1000)
+ @Property(value = {"fr","it"},cardinality=1000)
public static final String SUPPORTED_LANGUAGES =
"org.apache.stanbol.enhancer.engines.celi.ner.languages";
private String licenseKey;
@@ -128,7 +128,8 @@ public class CeliNamedEntityExtractionEn
super.activate(ctx);
@SuppressWarnings("unchecked")
Dictionary<String, Object> properties = ctx.getProperties();
-
+ log.info("Activate CELI NER engine:");
+ log.info(" > name: {}",getName());
this.licenseKey = (String) properties.get(LICENSE_KEY);
if (licenseKey == null || licenseKey.isEmpty()) {
log.warn("no CELI license key configured for this
Engine, a guest account will be used (max 100 requests per day). Go on
http://linguagrid.org for getting a proper license key.");
@@ -140,6 +141,7 @@ public class CeliNamedEntityExtractionEn
this.serviceURL = new URL(url);
this.client = new NERserviceClientHTTP(this.serviceURL,
this.licenseKey);
+ log.info(" > CELI service: {}",serviceURL);
//init the supported languages (now configurable)
Object languagObject = properties.get(SUPPORTED_LANGUAGES);
@@ -173,6 +175,7 @@ public class CeliNamedEntityExtractionEn
languagObject));
}
this.supportedLangs = Collections.unmodifiableSet(languages);
+ log.info(" > supported languages: {}",supportedLangs);
}
@Override
@@ -229,7 +232,7 @@ public class CeliNamedEntityExtractionEn
}
Language lang = new Language(language); //used for the plain literals
in TextAnnotations
try {
- List<NamedEntity> lista =
this.client.extractEntities(text);
+ List<NamedEntity> lista =
this.client.extractEntities(text, language);
LiteralFactory literalFactory =
LiteralFactory.getInstance();
MGraph g = ci.getMetadata();
@@ -269,7 +272,7 @@ public class CeliNamedEntityExtractionEn
private Resource getEntityRefForType(String type) {
if (!entityTypes.containsKey(type))
- return null;
+ return OntologicalClasses.SKOS_CONCEPT;
else
return entityTypes.get(type);
}
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java?rev=1344563&r1=1344562&r2=1344563&view=diff
==============================================================================
---
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
(original)
+++
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
Thu May 31 04:45:41 2012
@@ -41,14 +41,14 @@ public class NERserviceClientHTTP {
* The XML version, encoding; SOAP envelope, header and starting element
of the body;
* processTextRequest and text starting element.
*/
- private static final String REQUEST_PREFIX = "<?xml version=\"1.0\"
encoding=\""+UTF8.name()+"\"?>" +
+ private static final String SOAP_PREFIX = "<?xml version=\"1.0\"
encoding=\""+UTF8.name()+"\"?>" +
"<soapenv:Envelope
xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" " +
"xmlns:v0u0=\"http://linguagrid.org/ns/namedentityrecognition/v0u0\"><soapenv:Header/>"
+
- "<soapenv:Body><v0u0:processTextRequest><v0u0:text>";
+ "<soapenv:Body>";
/**
* closes the text, processTextRequest, SOAP body and envelope
*/
- private static final String REQUEST_SUFFIX =
"</v0u0:text></v0u0:processTextRequest></soapenv:Body></soapenv:Envelope>";
+ private static final String SOAP_SUFFIX =
"</soapenv:Body></soapenv:Envelope>";
private final URL serviceEP;
private final String licenseKey;
@@ -70,7 +70,7 @@ public class NERserviceClientHTTP {
}
- public List<NamedEntity> extractEntities(String text) throws
SOAPException, IOException {
+ public List<NamedEntity> extractEntities(String text, String lang)
throws SOAPException, IOException {
if(text == null || text.isEmpty()){
//no text -> no extractions
return Collections.emptyList();
@@ -80,9 +80,11 @@ public class NERserviceClientHTTP {
HttpURLConnection con = Utils.createPostRequest(serviceEP,
requestHeaders);
//write content
BufferedWriter writer = new BufferedWriter(new
OutputStreamWriter(con.getOutputStream(),UTF8));
- writer.write(REQUEST_PREFIX);
+ writer.write(SOAP_PREFIX);
+ writer.write("<v0u0:processTextRequest><v0u0:text>");
StringEscapeUtils.escapeXml(writer, text);
- writer.write(REQUEST_SUFFIX);
+
writer.write("</v0u0:text><v0u0:language>"+lang+"</v0u0:language></v0u0:processTextRequest>");
+ writer.write(SOAP_SUFFIX);
writer.close();
//Call the service
long start = System.currentTimeMillis();
Modified:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngineTest.java
URL:
http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngineTest.java?rev=1344563&r1=1344562&r2=1344563&view=diff
==============================================================================
---
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngineTest.java
(original)
+++
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngineTest.java
Thu May 31 04:45:41 2012
@@ -40,14 +40,19 @@ public class CeliNamedEntityExtractionEn
private static final ContentItemFactory ciFactory =
InMemoryContentItemFactory.getInstance();
- private static final String TEXT = "Brigitte Bardot, née le 28 septembre
1934 à Paris, est une actrice de cinéma et chanteuse française.";
-
+ private static final String TEXT_it = "Wolfgang Amadeus Mozart, nome di " +
+ "battesimo Joannes Chrysostomus Wolfgangus Theophilus Mozart " +
+ "(Salisburgo, 27 gennaio 1756 – Vienna, 5 dicembre 1791), è
stato " +
+ "un compositore, pianista, organista e violinista.";
+ private static final String TEXT_fr = "Brigitte Bardot, née le 28
septembre " +
+ "1934 à Paris, est une actrice de cinéma et chanteuse
française.";
+
@BeforeClass
public static void setUpServices() throws IOException,
ConfigurationException {
Dictionary<String, Object> properties = new Hashtable<String,
Object>();
properties.put(EnhancementEngine.PROPERTY_NAME, "celiNer");
properties.put(CeliNamedEntityExtractionEnhancementEngine.SERVICE_URL,
"http://linguagrid.org/LSGrid/ws/com.celi-france.linguagrid.namedentityrecognition.v0u0.demo");
-
properties.put(CeliNamedEntityExtractionEnhancementEngine.SUPPORTED_LANGUAGES,
"fr");
+
properties.put(CeliNamedEntityExtractionEnhancementEngine.SUPPORTED_LANGUAGES,
"fr;it");
MockComponentContext context = new MockComponentContext(properties);
nerEngine.activate(context);
}
@@ -60,17 +65,12 @@ public class CeliNamedEntityExtractionEn
public static ContentItem wrapAsContentItem(final String text) throws
IOException {
return ciFactory.createContentItem(new StringSource(text));
}
-
- @Test
- public void tesetEngine() throws Exception {
- ContentItem ci = wrapAsContentItem(TEXT);
+
+ private void testInput(String txt,String lang) throws EngineException,
IOException{
+ ContentItem ci = wrapAsContentItem(txt);
try {
- //add a simple triple to statically define the language of
the test
- //content
- ci.getMetadata().add(new TripleImpl(ci.getUri(),
DC_LANGUAGE, new PlainLiteralImpl("fr")));
- //unit test should not depend on each other (if possible)
-
//CeliLanguageIdentifierEnhancementEngineTest.addEnanchements(ci);
-
+ //add a simple triple to statically define the language of
the test content
+ ci.getMetadata().add(new TripleImpl(ci.getUri(),
DC_LANGUAGE, new PlainLiteralImpl(lang)));
nerEngine.computeEnhancements(ci);
TestUtils.logEnhancements(ci);
@@ -79,7 +79,7 @@ public class CeliNamedEntityExtractionEn
expectedValues.put(Properties.ENHANCER_EXTRACTED_FROM,
ci.getUri());
expectedValues.put(Properties.DC_CREATOR,
LiteralFactory.getInstance().createTypedLiteral(
nerEngine.getClass().getName()));
- int textAnnoNum =
validateAllTextAnnotations(ci.getMetadata(), TEXT, expectedValues);
+ int textAnnoNum =
validateAllTextAnnotations(ci.getMetadata(), txt, expectedValues);
log.info(textAnnoNum + " TextAnnotations found ...");
int entityAnnoNum =
EnhancementStructureHelper.validateAllEntityAnnotations(ci.getMetadata(),expectedValues);
log.info(entityAnnoNum + " EntityAnnotations found ...");
@@ -90,6 +90,12 @@ public class CeliNamedEntityExtractionEn
}
throw e;
}
+ }
+
+ @Test
+ public void tesetEngine() throws Exception {
+
this.testInput(CeliNamedEntityExtractionEnhancementEngineTest.TEXT_it, "it");
+
this.testInput(CeliNamedEntityExtractionEnhancementEngineTest.TEXT_fr, "fr");
}
// private int checkAllEntityAnnotations(MGraph g) {