Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java?rev=1375110&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/DBPSpotlightDisambiguateEnhancementEngine.java Mon Aug 20 17:14:56 2012 @@ -0,0 +1,497 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate; + +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_LABEL; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_REFERENCE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_ENTITY_TYPE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.util.Collection; +import java.util.Collections; +import java.util.Dictionary; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Hashtable; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.clerezza.rdf.core.Language; +import org.apache.clerezza.rdf.core.Literal; +import org.apache.clerezza.rdf.core.MGraph; +import org.apache.clerezza.rdf.core.NonLiteral; +import org.apache.clerezza.rdf.core.Resource; +import org.apache.clerezza.rdf.core.Triple; +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; +import org.apache.clerezza.rdf.core.impl.TripleImpl; +import org.apache.clerezza.rdf.core.serializedform.Serializer; +import org.apache.felix.scr.annotations.Component; +import 
org.apache.felix.scr.annotations.Properties; +import org.apache.felix.scr.annotations.Property; +import org.apache.felix.scr.annotations.Service; +import org.apache.stanbol.enhancer.servicesapi.Blob; +import org.apache.stanbol.enhancer.servicesapi.ContentItem; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; +import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; +import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; +import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; +import org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses; +import org.osgi.service.cm.ConfigurationException; +import org.osgi.service.component.ComponentContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; + +/** + * {@link DBPSpotlightDisambiguateEnhancementEngine} provides functionality to + * enhance document with their language. 
+ * + * @author Iavor Jelev, Babelmonkeys (GzEvD) + */ +@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.name", description = "%stanbol.DBPSpotlightDisambiguateEnhancementEngine.description") +@Service +@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightdisambiguate") }) +public class DBPSpotlightDisambiguateEnhancementEngine extends + AbstractEnhancementEngine<IOException, RuntimeException> implements + EnhancementEngine, ServiceProperties { + + // all parameters which can be used to configure the EnhancementEngine + @Property(value = "http://spotlight.dbpedia.org/rest/annotate") + public static final String SL_URL_KEY = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.url"; + + @Property(value = "Document") + public static final String SL_DISAMBIGUATOR = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.disambiguator"; + + @Property() + public static final String SL_RESTRICTION = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.types"; + + @Property() + public static final String SL_SPARQL = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.sparql"; + + @Property() + public static final String SL_SUPPORT = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.support"; + + @Property() + public static final String SL_CONFIDENCE = "stanbol.DBPSpotlightDisambiguateEnhancementEngine.confidence"; + + /** + * The default value for the Execution of this Engine. Currently set to + * {@link ServiceProperties#ORDERING_PRE_PROCESSING} + */ + public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 31; + + /** + * This contains the only MIME type directly supported by this enhancement + * engine. 
+ */ + private static final String TEXT_PLAIN_MIMETYPE = "text/plain"; + /** Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} */ + private static final Set<String> SUPPORTED_MIMTYPES = Collections + .singleton(TEXT_PLAIN_MIMETYPE); + /** This contains the logger. */ + private static final Logger log = LoggerFactory + .getLogger(DBPSpotlightDisambiguateEnhancementEngine.class); + /** holds the url of the Spotlight REST endpoint */ + private String spotlightUrl; + /** holds the chosen of disambiguator to be used */ + private String spotlightDisambiguator; + /** holds the type restriction for the results, if the user wishes one */ + private String spotlightTypesRestriction; + /** holds the chosen minimal support value */ + private String spotlightSupport; + /** holds the chosen minimal confidence value */ + private String spotlightConfidence; + /** holds the sparql restriction for the results, if the user wishes one */ + private String spotlightSparql; + /** + * holds the existing TextAnnotations, which are used as input for DBpedia + * Spotlight, and later for linking of the results + */ + private Hashtable<String, UriRef> textAnnotationsMap; + + /** + * Initialize all parameters from the configuration panel, or with their + * default values + * + * @param ce + * the {@link ComponentContext} + */ + @SuppressWarnings("unchecked") + protected void activate(ComponentContext ce) throws ConfigurationException, + IOException { + + super.activate(ce); + + Dictionary<String, Object> properties = ce.getProperties(); + spotlightUrl = properties.get(SL_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/annotate" + : (String) properties.get(SL_URL_KEY); + spotlightDisambiguator = properties.get(SL_DISAMBIGUATOR) == null ? null + : (String) properties.get(SL_DISAMBIGUATOR); + spotlightTypesRestriction = properties.get(SL_RESTRICTION) == null ? null + : (String) properties.get(SL_RESTRICTION); + spotlightSparql = properties.get(SL_SPARQL) == null ? 
null + : (String) properties.get(SL_SPARQL); + spotlightSupport = properties.get(SL_SUPPORT) == null ? "-1" + : (String) properties.get(SL_SUPPORT); + spotlightConfidence = properties.get(SL_CONFIDENCE) == null ? "-1" + : (String) properties.get(SL_CONFIDENCE); + } + + /** + * Check if the content can be enhanced + * + * @param ci + * the {@link ContentItem} + */ + public int canEnhance(ContentItem ci) throws EngineException { + if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) { + return ENHANCE_SYNCHRONOUS; + } else { + return CANNOT_ENHANCE; + } + } + + /** + * Calculate the enhancements by doing a POST request to the DBpedia + * Spotlight endpoint and processing the results + * + * @param ci + * the {@link ContentItem} + */ + public void computeEnhancements(ContentItem ci) throws EngineException { + Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, + SUPPORTED_MIMTYPES); + if (contentPart == null) { + throw new IllegalStateException( + "No ContentPart with Mimetype '" + + TEXT_PLAIN_MIMETYPE + + "' found for ContentItem " + + ci.getUri() + + ": This is also checked in the canEnhance method! 
-> This " + + "indicated an Bug in the implementation of the " + + "EnhancementJobManager!"); + } + String text = ""; + try { + text = ContentItemHelper.getText(contentPart.getValue()); + + } catch (IOException e) { + throw new InvalidContentException(this, ci, e); + } + + // Retrieve the existing text annotations (requires read lock) + MGraph graph = ci.getMetadata(); + String xmlTextAnnotations = this.getSpottedXml(text, graph); + Collection<Annotation> dbpslGraph = doPostRequest(text, + xmlTextAnnotations); + if (dbpslGraph != null) { + // Acquire a write lock on the ContentItem when adding the + // enhancements + ci.getLock().writeLock().lock(); + try { + createEnhancements(dbpslGraph, ci); + if (log.isDebugEnabled()) { + Serializer serializer = Serializer.getInstance(); + ByteArrayOutputStream debugStream = new ByteArrayOutputStream(); + serializer.serialize(debugStream, ci.getMetadata(), + "application/rdf+xml"); + try { + log.debug("DBpedia Enhancements:\n{}", + debugStream.toString("UTF-8")); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + } + } finally { + ci.getLock().writeLock().unlock(); + } + } + } + + /** + * The method adds the returned DBpedia Spotlight annotations to the content + * item's metadata. For each DBpedia resource an EntityAnnotation is created + * and linked to the according TextAnnotation. 
+ * + * @param occs + * a Collection of entity information + * @param ci + * the content item + */ + public void createEnhancements(Collection<Annotation> occs, + ContentItem ci) { + final Language language; // used for plain literals representing parts + // fo the content + String langString = getMetadataLanguage(ci.getMetadata(), null); + + if (langString != null && !langString.isEmpty()) { + language = new Language(langString); + } else { + language = null; + } + + HashMap<Resource, UriRef> entityAnnotationMap = new HashMap<Resource, UriRef>(); + + for (Annotation occ : occs) { + + if (textAnnotationsMap.get(occ.surfaceForm) != null) { + UriRef textAnnotation = textAnnotationsMap.get(occ.surfaceForm); + MGraph model = ci.getMetadata(); + UriRef entityAnnotation = EnhancementEngineHelper + .createEntityEnhancement(ci, this); + entityAnnotationMap.put(occ.uri, entityAnnotation); + Literal label = new PlainLiteralImpl(occ.surfaceForm, language); + model.add(new TripleImpl(entityAnnotation, DC_RELATION, + textAnnotation)); + model.add(new TripleImpl(entityAnnotation, + ENHANCER_ENTITY_LABEL, label)); + + HashSet<String> t = occ.getTypeNames(); + if (t != null) { + Iterator<String> it = t.iterator(); + while (it.hasNext()) + model.add(new TripleImpl(entityAnnotation, + ENHANCER_ENTITY_TYPE, new UriRef(it.next()))); + } + model.add(new TripleImpl(entityAnnotation, + ENHANCER_ENTITY_REFERENCE, occ.uri)); + } + } + } + + /** + * Sends a POST request to the DBpediaSpotlight url. 
+ * + * @param text + * a <code>String</code> with the text to be analyzed + * @param xmlTextAnnotations + * @param textAnnotations + * @return a <code>String</code> with the server response + * @throws EngineException + * if the request cannot be sent + */ + public Collection<Annotation> doPostRequest(String text, + String xmlTextAnnotations) throws EngineException { + StringBuilder data = new StringBuilder(); + + try { + data.append(URLEncoder.encode("spotter=SpotXmlParser", "UTF-8") + + "&"); + if (spotlightDisambiguator != null + && !spotlightDisambiguator.isEmpty()) + data.append(URLEncoder.encode("disambiguator", "UTF-8") + "=" + + URLEncoder.encode(spotlightDisambiguator, "UTF-8") + + "&"); + if (spotlightTypesRestriction != null + && !spotlightTypesRestriction.isEmpty()) + data.append(URLEncoder.encode("types", "UTF-8") + "=" + + URLEncoder.encode(spotlightTypesRestriction, "UTF-8") + + "&"); + if (spotlightSupport != null && !spotlightSupport.isEmpty()) + data.append(URLEncoder.encode("support", "UTF-8") + "=" + + URLEncoder.encode(spotlightSupport, "UTF-8") + "&"); + if (spotlightConfidence != null && !spotlightConfidence.isEmpty()) + data.append(URLEncoder.encode("confidence", "UTF-8") + "=" + + URLEncoder.encode(spotlightConfidence, "UTF-8") + "&"); + if (spotlightSparql != null && !spotlightSparql.isEmpty() + && spotlightTypesRestriction == null) + data.append(URLEncoder.encode("sparql", "UTF-8") + "=" + + URLEncoder.encode(spotlightSparql, "UTF-8") + "&"); + data.append(URLEncoder.encode("text", "UTF-8") + "=" + + URLEncoder.encode(xmlTextAnnotations, "UTF-8")); + } catch (UnsupportedEncodingException e) { + throw new EngineException( + "Data for the httprequest could not be converted. 
Error: " + + e.getMessage()); + } + + HttpURLConnection connection = null; + StringBuffer response = new StringBuffer(); + + try { + // Create connection + URL url = new URL(spotlightUrl); + connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("POST"); + connection.setRequestProperty("Content-Type", + "application/x-www-form-urlencoded"); + connection.setRequestProperty("Accept", "text/xml"); + + connection.setUseCaches(false); + connection.setDoInput(true); + connection.setDoOutput(true); + + // Send request + DataOutputStream wr = new DataOutputStream( + connection.getOutputStream()); + wr.writeBytes(data.toString()); + wr.flush(); + wr.close(); + + // Get Response + InputStream is = connection.getInputStream(); + BufferedReader rd = new BufferedReader(new InputStreamReader(is)); + String line; + while ((line = rd.readLine()) != null) { + response.append(line); + response.append('\r'); + } + rd.close(); + + } catch (Exception e) { + log.error("[request - error] The following error occurred: " + + e.getMessage()); + + } finally { + + if (connection != null) { + connection.disconnect(); + } + } + + XMLParser xmlParser = new XMLParser(); + try { + Document xmlDoc = xmlParser.loadXMLFromString(response.toString()); + NodeList nlist = xmlParser.getElementsByTagName(xmlDoc, "Resource"); + Collection<Annotation> annos = this.getAnnotations(nlist); + + return annos; + } catch (Exception e) { + throw new EngineException( + "Response XML could not be parsed. 
Error: " + + e.getMessage()); + } + } + + private String getSpottedXml(String text, MGraph graph) { + StringBuilder xml = new StringBuilder(); + textAnnotationsMap = new Hashtable<String, UriRef>(); + + xml.append(String.format("<annotation text=\"%s\">", text)); + try { + for (Iterator<Triple> it = graph.filter(null, RDF_TYPE, + TechnicalClasses.ENHANCER_TEXTANNOTATION); it.hasNext();) { + // Triple tAnnotation = it.next(); + UriRef uri = (UriRef) it.next().getSubject(); + String surfaceForm = EnhancementEngineHelper.getString(graph, + uri, ENHANCER_SELECTED_TEXT); + if (surfaceForm != null) { + String offset = EnhancementEngineHelper.getString(graph, + uri, ENHANCER_START); + textAnnotationsMap.put(surfaceForm, uri); + xml.append(String.format( + "<surfaceForm name=\"%s\" offset=\"%s\"/>", + surfaceForm, offset)); + } + } + } catch (Exception e) { + log.error(e.getMessage()); + } + + return xml.append("</annotation>").toString(); + } + + /** + * This method creates the Collection of Annotations, which the method + * <code>createEnhancement</code> adds to the meta data of the content item. 
+ * + * @param nList + * NodeList of all Resources contained in the XML response from + * DBpedia Spotlight + * @return a Collection<DBPSLAnnotation> with all annotations + */ + private Collection<Annotation> getAnnotations(NodeList nList) { + Collection<Annotation> dbpslAnnos = new HashSet<Annotation>(); + + for (int temp = 0; temp < nList.getLength(); temp++) { + Annotation dbpslann = new Annotation(); + Element node = (Element) nList.item(temp); + dbpslann.uri = new UriRef(node.getAttribute("URI")); + dbpslann.support = (new Integer(node.getAttribute("support"))) + .intValue(); + dbpslann.types = node.getAttribute("types"); + dbpslann.surfaceForm = node.getAttribute("surfaceForm"); + dbpslann.offset = (new Integer(node.getAttribute("offset"))) + .intValue(); + dbpslann.similarityScore = (new Double( + node.getAttribute("similarityScore"))).doubleValue(); + dbpslann.percentageOfSecondRank = (new Double( + node.getAttribute("percentageOfSecondRank"))).doubleValue(); + + dbpslAnnos.add(dbpslann); + } + + return dbpslAnnos; + } + + public Map<String, Object> getServiceProperties() { + return Collections.unmodifiableMap(Collections.singletonMap( + ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder)); + } + + public String getMetadataLanguage(MGraph model, NonLiteral subj) { + Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null); + if (it.hasNext()) { + Resource langNode = it.next().getObject(); + return getLexicalForm(langNode); + } + return null; + } + + public String getLexicalForm(Resource res) { + if (res == null) { + return null; + } else if (res instanceof Literal) { + return ((Literal) res).getLexicalForm(); + } else { + return res.toString(); + } + } + + /** + * This method is used by the test class to set the endpoint url + * + * @param url + * String the url of the Spotlight endpoint + */ + public void setEndpointUrl(String url) { + spotlightUrl = url; + } + +}
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java?rev=1375110&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/XMLParser.java Mon Aug 20 17:14:56 2012 @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Parses the XML results given by DBpedia Spotlight.
 * <p>
 * All parsers created by this class refuse DOCTYPE declarations and run with
 * secure processing enabled, because the parsed documents come from a remote
 * service and must not be able to pull in external entities.
 *
 * @author Iavor Jelev
 */
public class XMLParser {

    /** Xerces feature that makes the parser reject any DOCTYPE declaration. */
    private static final String DISALLOW_DOCTYPE = "http://apache.org/xml/features/disallow-doctype-decl";

    /**
     * @param doc
     *            the document to search
     * @param tagName
     *            the element name to look for
     * @return all elements of the document with the given tag name, in
     *         document order
     */
    public NodeList getElementsByTagName(Document doc, String tagName) {
        return doc.getElementsByTagName(tagName);
    }

    /**
     * Parses an XML document held in a String. The characters are handed to
     * the parser directly, so the result does not depend on the platform
     * default charset.
     *
     * @param xml
     *            the XML document, must not be <code>null</code>
     * @return the parsed and normalized document
     * @throws SAXException
     *             if the XML is not well formed, contains a DOCTYPE or no
     *             parser can be created
     * @throws IOException
     *             if reading fails
     * @throws IllegalArgumentException
     *             if <code>xml</code> is <code>null</code>
     */
    public Document loadXMLFromString(String xml) throws SAXException,
            IOException {
        if (xml == null) {
            throw new IllegalArgumentException(
                    "The XML String must not be null");
        }
        Document doc = newNamespaceAwareBuilder().parse(
                new org.xml.sax.InputSource(new java.io.StringReader(xml)));
        doc.getDocumentElement().normalize();
        return doc;
    }

    /**
     * Parses an XML document from a stream. The stream is always closed,
     * also when parsing fails.
     *
     * @param is
     *            the stream to read, must not be <code>null</code>
     * @return the parsed and normalized document
     * @throws SAXException
     *             if the XML is not well formed, contains a DOCTYPE or no
     *             parser can be created
     * @throws IOException
     *             if reading or closing the stream fails
     * @throws IllegalArgumentException
     *             if <code>is</code> is <code>null</code>
     */
    public Document loadXMLFromInputStream(InputStream is) throws SAXException,
            IOException {
        if (is == null) {
            throw new IllegalArgumentException(
                    "The InputStream must not be null");
        }
        try {
            Document doc = newNamespaceAwareBuilder().parse(is);
            doc.getDocumentElement().normalize();
            return doc;
        } finally {
            is.close();
        }
    }

    /**
     * Parses an XML file. Unlike the other load methods the parser used here
     * is not namespace aware.
     *
     * @param filePath
     *            path of the file to parse
     * @return the parsed and normalized document
     * @throws ParserConfigurationException
     *             if no parser can be created
     * @throws SAXException
     *             if the XML is not well formed or contains a DOCTYPE
     * @throws IOException
     *             if the file cannot be read
     */
    public Document loadXMLFromFile(String filePath)
            throws ParserConfigurationException, SAXException, IOException {
        File xmlFile = new File(filePath);
        Document doc = newBuilder(false).parse(xmlFile);
        doc.getDocumentElement().normalize();
        return doc;
    }

    /**
     * Creates a namespace aware builder and reports a configuration problem
     * as SAXException instead of continuing with a <code>null</code> builder.
     */
    private static DocumentBuilder newNamespaceAwareBuilder()
            throws SAXException {
        try {
            return newBuilder(true);
        } catch (ParserConfigurationException ex) {
            throw new SAXException("Unable to create an XML parser", ex);
        }
    }

    /** Creates a builder that is hardened against XXE attacks. */
    private static DocumentBuilder newBuilder(boolean namespaceAware)
            throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(namespaceAware);
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING,
                true);
        factory.setFeature(DISALLOW_DOCTYPE, true);
        factory.setExpandEntityReferences(false);
        return factory.newDocumentBuilder();
    }
}
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java?rev=1375110&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-disambiguate/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlight/disambiguate/core/DBPSpotlightDisambiguateEnhancementTest.java Mon Aug 20 17:14:56 2012 @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate.core; + +import java.util.Collection; + +import org.apache.commons.io.IOUtils; +import org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate.Annotation; +import org.apache.stanbol.enhancer.engines.dbpspotlight.disambiguate.DBPSpotlightDisambiguateEnhancementEngine; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.osgi.service.cm.ConfigurationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class provides a JUnit test for the DBpedia Spotlight Disambiguate + * EnhancementEngine: it posts the content of the classpath resource + * spots.xml to the configured Spotlight endpoint and expects at least one + * entity in the result. + * + * @author Iavor Jelev, babelmonkeys / GzEvD + */ +public class DBPSpotlightDisambiguateEnhancementTest { + + /** + * This contains the logger. + */ + private static final Logger LOG = LoggerFactory + .getLogger(DBPSpotlightDisambiguateEnhancementTest.class); + private static String SPL_URL = System + .getProperty(DBPSpotlightDisambiguateEnhancementEngine.SL_URL_KEY) == null ? 
"http://spotlight.dbpedia.org/rest/annotate" + : (String) System + .getProperty(DBPSpotlightDisambiguateEnhancementEngine.SL_URL_KEY); + private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday."; + private static DBPSpotlightDisambiguateEnhancementEngine dbpslight; + private static String testFile = "spots.xml"; + private static String spotsXml; + + @BeforeClass + public static void oneTimeSetup() throws ConfigurationException { + dbpslight = new DBPSpotlightDisambiguateEnhancementEngine(); + dbpslight.setEndpointUrl(SPL_URL); + } + + @Test + public void testEntityExtraction() { + Collection<Annotation> entities; + try { + spotsXml = IOUtils.toString(this.getClass().getClassLoader() + .getResourceAsStream(testFile)); + System.out.println(SPL_URL); + entities = dbpslight.doPostRequest(TEST_TEXT, spotsXml); + LOG.info("Found entities: {}", entities.size()); + LOG.debug("Entities:\n{}", entities); + Assert.assertFalse("No entities were found!", entities.isEmpty()); + } catch (Exception e) { + Assert.assertFalse("An EngineException occurred! 
The message was: " + + e.getMessage(), true); + } + } +} Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml?rev=1375110&r1=1375107&r2=1375110&view=diff ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml (original) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/pom.xml Mon Aug 20 17:14:56 2012 @@ -22,7 +22,7 @@ </parent> <groupId>org.apache.stanbol</groupId> - <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlightspot</artifactId> + <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlight.spot</artifactId> <packaging>bundle</packaging> <name>Apache Stanbol Enhancer Enhancement Engine : DBPedia Spotlight Spot</name> @@ -43,7 +43,7 @@ <configuration> <instructions> <Export-Package> - org.apache.stanbol.enhancer.engines.dbpspotlightspot;version=${project.version} + org.apache.stanbol.enhancer.engines.dbpspotlight.spot;version=${project.version} </Export-Package> <Embed-Dependency> </Embed-Dependency> Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java?rev=1375110&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java (added) +++ 
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/DBPSpotlightSpotEnhancementEngine.java Mon Aug 20 17:14:56 2012 @@ -0,0 +1,429 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.stanbol.enhancer.engines.dbpspotlight.spot; + +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Dictionary; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.clerezza.rdf.core.Language; +import org.apache.clerezza.rdf.core.Literal; +import org.apache.clerezza.rdf.core.LiteralFactory; +import org.apache.clerezza.rdf.core.MGraph; +import org.apache.clerezza.rdf.core.NonLiteral; +import org.apache.clerezza.rdf.core.Resource; +import org.apache.clerezza.rdf.core.Triple; +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; +import org.apache.clerezza.rdf.core.impl.TripleImpl; +import org.apache.clerezza.rdf.core.serializedform.Serializer; +import org.apache.felix.scr.annotations.Component; +import org.apache.felix.scr.annotations.Properties; +import org.apache.felix.scr.annotations.Property; +import org.apache.felix.scr.annotations.Service; +import 
org.apache.stanbol.enhancer.servicesapi.Blob; +import org.apache.stanbol.enhancer.servicesapi.ContentItem; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; +import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; +import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; +import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; +import org.osgi.service.cm.ConfigurationException; +import org.osgi.service.component.ComponentContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; + +/** + * {@link DBPSpotlightSpotEnhancementEngine} provides functionality to enhance + * document with their language. + * + * @author Iavor Jelev, Babelmonkeys (GzEvD) + */ +@Component(metatype = true, immediate = true, label = "%stanbol.DBPSpotlightSpotEnhancementEngine.name", description = "%stanbol.DBPSpotlightSpotEnhancementEngine.description") +@Service +@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "dbpspotlightspot") }) +public class DBPSpotlightSpotEnhancementEngine extends + AbstractEnhancementEngine<IOException, RuntimeException> implements + EnhancementEngine, ServiceProperties { + + /** + * a configurable value of the text segment length to check + */ + @Property(value = "http://spotlight.dbpedia.org/rest/spot") + public static final String SL_URL_KEY = "stanbol.DBPSpotlightSpotEnhancementEngine.url"; + + @Property(value = "LingPipeSpotter") + public static final String SL_SPOTTER = "stanbol.DBPSpotlightSpotEnhancementEngine.spotter"; + + /** + * The default value for the Execution of this Engine. 
Currently set to + * {@link ServiceProperties#ORDERING_PRE_PROCESSING} + */ + public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 29; + + /** + * This contains the only MIME type directly supported by this enhancement + * engine. + */ + private static final String TEXT_PLAIN_MIMETYPE = "text/plain"; + /** + * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} + */ + private static final Set<String> SUPPORTED_MIMTYPES = Collections + .singleton(TEXT_PLAIN_MIMETYPE); + + /** + * This contains a list of languages supported by DBpedia Spotlight. If the + * metadata doesn't contain a value for the language as the value of the + * {@link Property.DC_LANG property} the content can't be processed. + */ + protected static final Set<String> SUPPORTED_LANGUAGES = Collections + .unmodifiableSet(new HashSet<String>(Arrays.asList("en"))); + + /** holds the logger. */ + private static final Logger log = LoggerFactory + .getLogger(DBPSpotlightSpotEnhancementEngine.class); + + /** holds the url of the Spotlight REST endpoint */ + private String spotlightUrl; + /** holds the chosen of spotter to be used */ + private String spotlightSpotter; + + /** + * Initialize all parameters from the configuration panel, or with their + * default values + * + * @param ce + * the {@link ComponentContext} + */ + @SuppressWarnings("unchecked") + protected void activate(ComponentContext ce) throws ConfigurationException, + IOException { + + super.activate(ce); + + Dictionary<String, Object> properties = ce.getProperties(); + spotlightUrl = properties.get(SL_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/spot" + : (String) properties.get(SL_URL_KEY); + spotlightSpotter = properties.get(SL_SPOTTER) == null ? 
null + : (String) properties.get(SL_SPOTTER); + } + + /** + * Check if the content can be enhanced + * + * @param ci + * the {@link ContentItem} + */ + public int canEnhance(ContentItem ci) throws EngineException { + if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) { + String language = getMetadataLanguage(ci.getMetadata(), null); + if (language != null && !SUPPORTED_LANGUAGES.contains(language)) { + log.info( + "DBpedia Spotlight can not process ContentItem {} because " + + "language {} is not supported (supported: {})", + new Object[] { ci.getUri(), language, + SUPPORTED_LANGUAGES }); + return CANNOT_ENHANCE; + } + return ENHANCE_SYNCHRONOUS; + } + return CANNOT_ENHANCE; + } + + /** + * Calculate the enhancements by doing a POST request to the DBpedia + * Spotlight endpoint and processing the results + * + * @param ci + * the {@link ContentItem} + */ + public void computeEnhancements(ContentItem ci) throws EngineException { + Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, + SUPPORTED_MIMTYPES); + if (contentPart == null) { + throw new IllegalStateException( + "No ContentPart with Mimetype '" + + TEXT_PLAIN_MIMETYPE + + "' found for ContentItem " + + ci.getUri() + + ": This is also checked in the canEnhance method! 
-> This " + + "indicated an Bug in the implementation of the " + + "EnhancementJobManager!"); + } + String text = ""; + try { + text = ContentItemHelper.getText(contentPart.getValue()); + } catch (IOException e) { + throw new InvalidContentException(this, ci, e); + } + + Collection<SurfaceForm> dbpslGraph = doPostRequest(text); + if (dbpslGraph != null) { + // Acquire a write lock on the ContentItem when adding the + // enhancements + ci.getLock().writeLock().lock(); + try { + createEnhancements(dbpslGraph, ci); + if (log.isDebugEnabled()) { + Serializer serializer = Serializer.getInstance(); + ByteArrayOutputStream debugStream = new ByteArrayOutputStream(); + serializer.serialize(debugStream, ci.getMetadata(), + "application/rdf+xml"); + try { + log.debug("DBpedia Spotlight Spot Enhancements:\n{}", + debugStream.toString("UTF-8")); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + } + } finally { + ci.getLock().writeLock().unlock(); + } + } + } + + /** + * The method adds the returned DBpedia Spotlight surface forms to the + * content item's metadata. For each one an TextAnnotation is created. 
+ * + * @param occs + * a Collection of entity information + * @param ci + * the content item + */ + public void createEnhancements(Collection<SurfaceForm> occs, + ContentItem ci) { + LiteralFactory literalFactory = LiteralFactory.getInstance(); + final Language language; // used for plain literals representing parts + // fo the content + String langString = getMetadataLanguage(ci.getMetadata(), null); + + if (langString != null && !langString.isEmpty()) { + language = new Language(langString); + } else { + language = null; + } + + HashMap<String, UriRef> entityAnnotationMap = new HashMap<String, UriRef>(); + + for (SurfaceForm occ : occs) { + UriRef textAnnotation = EnhancementEngineHelper + .createTextEnhancement(ci, this); + MGraph model = ci.getMetadata(); + + model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, + new PlainLiteralImpl(occ.name, language))); + model.add(new TripleImpl(textAnnotation, ENHANCER_START, + literalFactory.createTypedLiteral(occ.offset))); + model.add(new TripleImpl(textAnnotation, ENHANCER_END, + literalFactory.createTypedLiteral(occ.offset + + occ.name.length()))); + model.add(new TripleImpl(textAnnotation, DC_TYPE, new UriRef( + occ.type))); + // TODO ################## model.add(new TripleImpl(textAnnotation, + // ENHANCER_SELECTION_CONTEXT, new + // PlainLiteralImpl(occ.context,language))); + + if (entityAnnotationMap.containsKey(occ.name)) { + model.add(new TripleImpl(entityAnnotationMap.get(occ.name), + DC_RELATION, textAnnotation)); + } else { + entityAnnotationMap.put(occ.name, textAnnotation); + } + } + } + + /** + * Sends a POST request to the DBpediaSpotlight url. 
+ * + * @param text + * a <code>String</code> with the text to be analyzed + * @return a <code>String</code> with the server response + * @throws EngineException + * if the request cannot be sent + */ + public Collection<SurfaceForm> doPostRequest(String text) + throws EngineException { + StringBuilder data = new StringBuilder(); + try { + if (spotlightSpotter != null && !spotlightSpotter.isEmpty()) + data.append(URLEncoder.encode("spotter", "UTF-8") + "=" + + URLEncoder.encode(spotlightSpotter, "UTF-8") + "&"); + data.append(URLEncoder.encode("text", "UTF-8") + "=" + + URLEncoder.encode(text, "UTF-8")); + } catch (UnsupportedEncodingException e) { + throw new EngineException( + "Data for the httprequest could not be converted. Error: " + + e.getMessage()); + } + + HttpURLConnection connection = null; + StringBuffer response = new StringBuffer(); + + try { + // Create connection + URL url = new URL(spotlightUrl); + connection = (HttpURLConnection) url.openConnection(); + connection.setRequestMethod("POST"); + connection.setRequestProperty("Content-Type", + "application/x-www-form-urlencoded"); + connection.setRequestProperty("Accept", "text/xml"); + + connection.setUseCaches(false); + connection.setDoInput(true); + connection.setDoOutput(true); + + // Send request + DataOutputStream wr = new DataOutputStream( + connection.getOutputStream()); + wr.writeBytes(data.toString()); + wr.flush(); + wr.close(); + + // Get Response + InputStream is = connection.getInputStream(); + BufferedReader rd = new BufferedReader(new InputStreamReader(is)); + String line; + while ((line = rd.readLine()) != null) { + response.append(line); + response.append('\r'); + } + rd.close(); + + } catch (Exception e) { + + log.error("[request] Request could not be made. 
Error: " + + e.getMessage()); + e.printStackTrace(); + return null; + + } finally { + + if (connection != null) { + connection.disconnect(); + } + } + + XMLParser xmlParser = new XMLParser(); + try { + Document xmlDoc = xmlParser.loadXMLFromString(response.toString()); + NodeList nlist = xmlParser.getElementsByTagName(xmlDoc, + "surfaceForm"); + Collection<SurfaceForm> annos = this.getAnnotations(nlist); + + return annos; + } catch (Exception e) { + log.error("[response] Response XML could not be parsed. Error: " + + e.getMessage()); + throw new EngineException( + "Response XML could not be parsed. Error: " + + e.getMessage()); + } + } + + /** + * This method creates the Collection of surface forms, which the method + * <code>createEnhancement</code> adds to the meta data of the content item + * as TextAnnotations. + * + * @param nList + * NodeList of all Resources contained in the XML response from + * DBpedia Spotlight + * @return a Collection<DBPSLSurfaceForm> with all annotations + */ + private Collection<SurfaceForm> getAnnotations(NodeList nList) { + Collection<SurfaceForm> dbpslAnnos = new HashSet<SurfaceForm>(); + + for (int temp = 0; temp < nList.getLength(); temp++) { + SurfaceForm dbpslann = new SurfaceForm(); + Element node = (Element) nList.item(temp); + dbpslann.name = node.getAttribute("name"); + dbpslann.offset = (new Integer(node.getAttribute("offset"))) + .intValue(); + dbpslann.type = node.getAttribute("type"); + + dbpslAnnos.add(dbpslann); + } + + return dbpslAnnos; + } + + public Map<String, Object> getServiceProperties() { + return Collections.unmodifiableMap(Collections.singletonMap( + ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder)); + } + + public String getMetadataLanguage(MGraph model, NonLiteral subj) { + Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null); + if (it.hasNext()) { + Resource langNode = it.next().getObject(); + return getLexicalForm(langNode); + } + return null; + } + + public String getLexicalForm(Resource res) 
{ + if (res == null) { + return null; + } else if (res instanceof Literal) { + return ((Literal) res).getLexicalForm(); + } else { + return res.toString(); + } + } + + /** + * This method is used by the test class to set the endpoint url + * + * @param url + * String the url of the Spotlight endpoint + */ + public void setEndpointUrl(String url) { + spotlightUrl = url; + } + +} Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java?rev=1375110&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight-spot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlight/spot/SurfaceForm.java Mon Aug 20 17:14:56 2012 @@ -0,0 +1,36 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
/**
 * Stores the surface forms given by DBPedia Spotlight Spot.
 *
 * @author Iavor Jelev
 */
public class SurfaceForm {

    /** The text of the surface form. */
    public String name;

    /** The type reported for the surface form. */
    public String type;

    /**
     * The offset reported for the surface form (presumably the character
     * position within the analysed text -- confirm against the service).
     */
    public Integer offset;

    /**
     * @return a representation of the form
     *         <code>[name=..., offset=..., type=...]</code>; fields that are
     *         not set are printed as <code>null</code>
     */
    @Override
    public String toString() {
        // %d is the integer conversion of java.util.Formatter; the C-style %i
        // is not supported and throws an UnknownFormatConversionException
        return String.format("[name=%s, offset=%d, type=%s]", name, offset,
                type);
    }
}
/**
 * Parses the XML results given by DBPedia Spotlight.
 *
 * All documents are parsed with loading of external entities and external
 * DTDs disabled, because the parsed XML typically originates from a remote
 * service.
 *
 * @author Iavor Jelev
 */
public class XMLParser {

    /**
     * @param doc
     *            the document to search
     * @param tagName
     *            the tag name to look for
     * @return all elements of the document with the given tag name
     */
    public NodeList getElementsByTagName(Document doc, String tagName) {

        return doc.getElementsByTagName(tagName);
    }

    /**
     * Parses an XML document that is already available as a String.
     *
     * @param xml
     *            the XML document
     * @return the parsed (namespace aware) and normalized document
     * @throws SAXException
     *             if the XML is not well formed or no parser can be created
     * @throws IOException
     *             on any error while reading
     */
    public Document loadXMLFromString(String xml) throws SAXException,
            IOException {
        // Parse from a character stream: the text is already decoded, so
        // converting it back to bytes with the platform default charset could
        // contradict the encoding stated in the XML declaration.
        Document doc = newBuilder().parse(
                new org.xml.sax.InputSource(new java.io.StringReader(xml)));
        doc.getDocumentElement().normalize();

        return doc;
    }

    /**
     * Parses an XML document from a stream. The stream is closed by this
     * method, also if parsing fails.
     *
     * @param is
     *            the stream to read the XML from
     * @return the parsed (namespace aware) and normalized document
     * @throws SAXException
     *             if the XML is not well formed or no parser can be created
     * @throws IOException
     *             on any error while reading
     */
    public Document loadXMLFromInputStream(InputStream is) throws SAXException,
            IOException {
        try {
            Document doc = newBuilder().parse(is);
            doc.getDocumentElement().normalize();

            return doc;
        } finally {
            if (is != null) {
                is.close();
            }
        }
    }

    /**
     * Parses an XML document from a file. In contrast to the other load
     * methods the parser used here is not namespace aware.
     *
     * @param filePath
     *            the path of the XML file
     * @return the parsed and normalized document
     * @throws ParserConfigurationException
     *             if no parser can be created
     * @throws SAXException
     *             if the XML is not well formed
     * @throws IOException
     *             on any error while reading
     */
    public Document loadXMLFromFile(String filePath)
            throws ParserConfigurationException, SAXException, IOException {
        File fXmlFile = new File(filePath);
        DocumentBuilder dBuilder = newFactory(false).newDocumentBuilder();
        Document doc = dBuilder.parse(fXmlFile);
        doc.getDocumentElement().normalize();

        return doc;
    }

    /**
     * Creates a namespace aware builder and reports a configuration problem
     * as a SAXException, the exception type declared by the load methods
     * (previously the problem was swallowed and surfaced as a
     * NullPointerException).
     */
    private static DocumentBuilder newBuilder() throws SAXException {
        try {
            return newFactory(true).newDocumentBuilder();
        } catch (ParserConfigurationException ex) {
            throw new SAXException("Unable to create an XML parser", ex);
        }
    }

    /** Creates a factory that neither resolves external entities nor loads external DTDs. */
    private static DocumentBuilderFactory newFactory(boolean namespaceAware)
            throws ParserConfigurationException {
        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
        factory.setNamespaceAware(namespaceAware);
        factory.setFeature(javax.xml.XMLConstants.FEATURE_SECURE_PROCESSING,
                true);
        factory.setFeature(
                "http://xml.org/sax/features/external-general-entities", false);
        factory.setFeature(
                "http://xml.org/sax/features/external-parameter-entities",
                false);
        factory.setFeature(
                "http://apache.org/xml/features/nonvalidating/load-external-dtd",
                false);
        return factory;
    }
}
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.dbpspotlight.spot.core; + +import java.util.Collection; + +import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.DBPSpotlightSpotEnhancementEngine; +import org.apache.stanbol.enhancer.engines.dbpspotlight.spot.SurfaceForm; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.osgi.service.cm.ConfigurationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class provides a JUnit test for DBpedia Spotlight Spot + * EnhancementEngine. + * + * @author Iavor Jelev, babelmonkeys / GzEvD + */ +public class DBPSpotlightSpotEnhancementTest { + + /** + * This contains the logger. + */ + private static final Logger LOG = LoggerFactory + .getLogger(DBPSpotlightSpotEnhancementTest.class); + private static String SPL_URL = System + .getProperty(DBPSpotlightSpotEnhancementEngine.SL_URL_KEY) == null ? "http://spotlight.dbpedia.org/rest/spot" + : (String) System + .getProperty(DBPSpotlightSpotEnhancementEngine.SL_URL_KEY); + private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday"; + private static DBPSpotlightSpotEnhancementEngine dbpslight; + + @BeforeClass + public static void oneTimeSetup() throws ConfigurationException { + dbpslight = new DBPSpotlightSpotEnhancementEngine(); + dbpslight.setEndpointUrl(SPL_URL); + } + + @Test + public void testEntityExtraction() { + Collection<SurfaceForm> entities; + try { + entities = dbpslight.doPostRequest(TEST_TEXT); + LOG.info("Found entities: {}", entities.size()); + LOG.debug("Entities:\n{}", entities); + Assert.assertFalse("No entities were found!", entities.isEmpty()); + } catch (EngineException e) { + Assert.assertFalse("An EngineException occurred! 
The message was: " + + e.getMessage(), true); + } + } + +} Modified: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml?rev=1375110&r1=1375109&r2=1375110&view=diff ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml (original) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/pom.xml Mon Aug 20 17:14:56 2012 @@ -57,9 +57,9 @@ <module>opencalais</module> <!-- http://opencalais.com/ --> <module>zemanta</module> <!-- htt://zemanta.com --> <!-- DBpedia.org Spotlight Enhancement Engines (STANBOL-706) --> - <module>dbpspotlightannotate</module> - <module>dbpspotlightcandidates</module> - <module>dbpspotlightdisambiguate</module> - <module>dbpspotlightspot</module> + <module>dbpedia-spotlight-annotate</module> + <module>dbpedia-spotlight-candidates</module> + <module>dbpedia-spotlight-disambiguate</module> + <module>dbpedia-spotlight-spot</module> </modules> </project>
