Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/pom.xml Mon Aug 20 12:11:01 2012 @@ -0,0 +1,121 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + + <modelVersion>4.0.0</modelVersion> + + <parent> + <artifactId>org.apache.stanbol.enhancer.parent</artifactId> + <groupId>org.apache.stanbol</groupId> + <version>0.9.0-incubating</version> + <relativePath>../../parent</relativePath> + </parent> + + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.enhancer.engines.dbpspotlightspot</artifactId> + <packaging>bundle</packaging> + + <name>Apache Stanbol Enhancer Enhancement Engine : DBPedia Spotlight Spot</name> + <description>an enhancement engine for spotting</description> + + <inceptionYear>2010</inceptionYear> + + <!-- NOTE(review): the disabled scm section below still references the langid engine tag; correct the URLs before enabling it. --> + <!--scm> + <connection> + scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/tags/0.9.0-incubating/enhancer/engines/langid/ + </connection> + <developerConnection> + scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/tags/0.9.0-incubating/enhancer/engines/langid/ + </developerConnection> + <url>http://incubator.apache.org/stanbol/</url> + </scm--> + + <build> + <plugins> + <plugin> + <groupId>org.apache.felix</groupId> + <artifactId>maven-bundle-plugin</artifactId> + <extensions>true</extensions> + <configuration> + <instructions> + <Export-Package> + org.apache.stanbol.enhancer.engines.dbpspotlightspot;version=${project.version} + </Export-Package> + <!-- Left empty: no dependency is embedded into the bundle. --> + <Embed-Dependency> + </Embed-Dependency> + </instructions> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.felix</groupId> + <artifactId>maven-scr-plugin</artifactId> + </plugin> + <plugin> + <groupId>org.apache.rat</groupId> + <artifactId>apache-rat-plugin</artifactId> + <configuration> + <excludes> + <!-- AL20 licensed files: See src/test/resources/README --> + <exclude>src/test/resources/en.txt</exclude> + </excludes> + </configuration> + </plugin> + </plugins> + </build> + + <properties> + 
 <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <!-- No versions are declared on the dependencies below; presumably they are managed by the parent POM (confirm). --> + <dependencies> + <dependency> + <groupId>org.apache.stanbol</groupId> + <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + </dependency> + + <dependency> + <groupId>org.apache.felix</groupId> + <artifactId>org.apache.felix.scr.annotations</artifactId> + </dependency> + <dependency> + <groupId>org.apache.clerezza</groupId> + <artifactId>rdf.core</artifactId> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + </dependencies> + +</project>
Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/license/THIRD-PARTY.properties Mon Aug 20 12:11:01 2012 @@ -0,0 +1,17 @@ +# Generated by org.codehaus.mojo.license.AddThirdPartyMojo +#------------------------------------------------------------------------------- +# Already used licenses in project : +# - Apache License +# - Common Development and Distribution License (CDDL) v1.0 +# - Common Public License Version 1.0 +# - ICU License +# - MIT License +# - The Apache Software License, Version 2.0 +#------------------------------------------------------------------------------- +# Please fill the missing licenses for dependencies : +# +# +#Wed Feb 15 19:06:13 CET 2012 +javax.servlet--servlet-api--2.4=Common Development And Distribution License (CDDL), Version 1.0 +org.osgi--org.osgi.compendium--4.1.0=The Apache Software License, Version 2.0 +org.osgi--org.osgi.core--4.1.0=The Apache Software License, Version 2.0 Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java?rev=1374984&view=auto ============================================================================== --- 
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSLSurfaceForm.java Mon Aug 20 12:11:01 2012 @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.dbpspotlightspot; + +//import org.apache.clerezza.rdf.core.Resource; + +/** + * Stores the surface forms given by DBPedia Spotlight Spot. 
+ * + * @author <a href="mailto:[email protected]">Iavor Jelev</a> + */ +public class DBPSLSurfaceForm { + + public String name; + public String type; + public Integer offset; + + public String toString() { + return String.format( "[name=%s, offset=%i, type=%s]", name, offset, type ) ; + } +} Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/DBPSpotlightSpotEnhancementEngine.java Mon Aug 20 12:11:01 2012 @@ -0,0 +1,393 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.dbpspotlightspot; + +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT; +import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START; + +import java.io.BufferedReader; +import java.io.ByteArrayOutputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UnsupportedEncodingException; +import java.net.HttpURLConnection; +import java.net.URL; +import java.net.URLEncoder; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.Dictionary; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Map; +import java.util.Map.Entry; +import java.util.Set; + +import org.apache.clerezza.rdf.core.Language; +import org.apache.clerezza.rdf.core.Literal; +import org.apache.clerezza.rdf.core.LiteralFactory; +import org.apache.clerezza.rdf.core.MGraph; +import org.apache.clerezza.rdf.core.NonLiteral; +import org.apache.clerezza.rdf.core.Resource; +import org.apache.clerezza.rdf.core.Triple; +import org.apache.clerezza.rdf.core.UriRef; +import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl; +import org.apache.clerezza.rdf.core.impl.TripleImpl; +import org.apache.clerezza.rdf.core.serializedform.Serializer; +import org.apache.felix.scr.annotations.Component; +import org.apache.felix.scr.annotations.Properties; +import 
org.apache.felix.scr.annotations.Property; +import org.apache.felix.scr.annotations.Service; +import org.apache.stanbol.enhancer.servicesapi.Blob; +import org.apache.stanbol.enhancer.servicesapi.ContentItem; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.InvalidContentException; +import org.apache.stanbol.enhancer.servicesapi.ServiceProperties; +import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper; +import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper; +import org.osgi.service.cm.ConfigurationException; +import org.osgi.service.component.ComponentContext; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.w3c.dom.Document; +import org.w3c.dom.Element; +import org.w3c.dom.NodeList; + +/** + * {@link DBPSpotlightSpotEnhancementEngine} provides functionality to enhance document + * with their language. 
+ * + * @author Iavor Jelev, Babelmonkeys (GzEvD) + */ +@Component( + metatype = true, + immediate = true, + label = "%stanbol.DBPSpotlightSpotEnhancementEngine.name", + description = "%stanbol.DBPSpotlightSpotEnhancementEngine.description") +@Service +@Properties(value={ + @Property(name=EnhancementEngine.PROPERTY_NAME,value="dbpspotlightspot") +}) +public class DBPSpotlightSpotEnhancementEngine + extends AbstractEnhancementEngine<IOException,RuntimeException> + implements EnhancementEngine, ServiceProperties { + + /** + * a configurable value of the text segment length to check + */ + @Property(value = "http://spotlight.dbpedia.org/rest/spot") + public static final String SL_URL_KEY = "stanbol.DBPSpotlightSpotEnhancementEngine.url"; + + @Property(value = "LingPipeSpotter") + public static final String SL_SPOTTER = "stanbol.DBPSpotlightSpotEnhancementEngine.spotter"; + + + /** + * The default value for the Execution of this Engine. Currently set to + * {@link ServiceProperties#ORDERING_PRE_PROCESSING} + */ + public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION - 29; + + /** + * This contains the only MIME type directly supported by this enhancement engine. + */ + private static final String TEXT_PLAIN_MIMETYPE = "text/plain"; + /** + * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE} + */ + private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE); + + /** + * This contains a list of languages supported by DBpedia Spotlight. + * If the metadata doesn't contain a value for the language as the value of the {@link Property.DC_LANG property} + * the content can't be processed. + */ + protected static final Set<String> SUPPORTED_LANGUAGES = + Collections.unmodifiableSet(new HashSet<String>( + Arrays.asList("en"))); + + /** holds the logger. 
*/ + private static final Logger log = LoggerFactory.getLogger(DBPSpotlightSpotEnhancementEngine.class); + + /** holds the url of the Spotlight REST endpoint */ + private String spotlightUrl; + /** holds the chosen of spotter to be used */ + private String spotlightSpotter; + + + + /** + * Initialize all parameters from the configuration panel, or with their default values + * @param ce the {@link ComponentContext} + */ + @SuppressWarnings("unchecked") + protected void activate(ComponentContext ce) throws ConfigurationException, IOException { + + super.activate(ce); + + Dictionary<String, Object> properties = ce.getProperties(); + spotlightUrl = properties.get( SL_URL_KEY ) == null ? "http://spotlight.dbpedia.org/rest/spot" : (String) properties.get( SL_URL_KEY ); + spotlightSpotter = properties.get( SL_SPOTTER ) == null ? null : (String) properties.get( SL_SPOTTER ); + } + + + /** + * Check if the content can be enhanced + * @param ci the {@link ContentItem} + */ + public int canEnhance(ContentItem ci) throws EngineException { + if(ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null){ + String language = getMetadataLanguage(ci.getMetadata(), null); + if (language != null && !SUPPORTED_LANGUAGES.contains(language)) { + log.info("DBpedia Spotlight can not process ContentItem {} because " + + "language {} is not supported (supported: {})", + new Object[]{ci.getUri(),language,SUPPORTED_LANGUAGES}); + return CANNOT_ENHANCE; + } + return ENHANCE_SYNCHRONOUS; + } + return CANNOT_ENHANCE; + } + + + /** + * Calculate the enhancements by doing a POST request to the DBpedia Spotlight endpoint and processing the results + * @param ci the {@link ContentItem} + */ + public void computeEnhancements( ContentItem ci ) throws EngineException { + Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES); + if(contentPart == null){ + throw new IllegalStateException("No ContentPart with Mimetype '" + + TEXT_PLAIN_MIMETYPE+"' found for ContentItem 
"+ci.getUri() + + ": This is also checked in the canEnhance method! -> This " + + "indicated an Bug in the implementation of the " + + "EnhancementJobManager!"); + } + String text = ""; + try { + text = ContentItemHelper.getText(contentPart.getValue()); + } catch (IOException e) { + throw new InvalidContentException(this, ci, e); + } + + Collection<DBPSLSurfaceForm> dbpslGraph = doPostRequest( text ); + if ( dbpslGraph != null ) { + //Acquire a write lock on the ContentItem when adding the enhancements + ci.getLock().writeLock().lock(); + try { + createEnhancements( dbpslGraph, ci); + if (log.isDebugEnabled()) { + Serializer serializer = Serializer.getInstance(); + ByteArrayOutputStream debugStream = new ByteArrayOutputStream(); + serializer.serialize(debugStream, ci.getMetadata(), "application/rdf+xml"); + try { + log.debug("DBpedia Spotlight Spot Enhancements:\n{}",debugStream.toString("UTF-8")); + } catch (UnsupportedEncodingException e) { + e.printStackTrace(); + } + } + } finally { + ci.getLock().writeLock().unlock(); + } + } + } + + + /** + * The method adds the returned DBpedia Spotlight surface forms to the content item's metadata. + * For each one an TextAnnotation is created. 
+ * + * @param occs a Collection of entity information + * @param ci the content item + */ + public void createEnhancements( Collection<DBPSLSurfaceForm> occs, ContentItem ci ) { + LiteralFactory literalFactory = LiteralFactory.getInstance(); + final Language language; // used for plain literals representing parts fo the content + String langString = getMetadataLanguage(ci.getMetadata(), null); + + if(langString != null && !langString.isEmpty()){ + language = new Language(langString); + } else { + language = null; + } + + HashMap<String, UriRef> entityAnnotationMap = new HashMap<String, UriRef>(); + + for (DBPSLSurfaceForm occ : occs) { + UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement( ci, this ); + MGraph model = ci.getMetadata(); + + model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,new PlainLiteralImpl(occ.name,language))); + model.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(occ.offset))); + model.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(occ.offset + occ.name.length()))); + model.add(new TripleImpl(textAnnotation, DC_TYPE, new UriRef( occ.type ))); + // TODO ################## model.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(occ.context,language))); + + if (entityAnnotationMap.containsKey(occ.name)) { + model.add(new TripleImpl(entityAnnotationMap.get(occ.name), DC_RELATION, textAnnotation)); + } + else { + entityAnnotationMap.put(occ.name,textAnnotation); + } + } + } + + + + + /** + * Sends a POST request to the DBpediaSpotlight url. 
+ * @param text a <code>String</code> with the text to be analyzed + * @return a <code>String</code> with the server response + * @throws EngineException if the request cannot be sent + */ + public Collection<DBPSLSurfaceForm> doPostRequest( String text ) throws EngineException { + StringBuilder data = new StringBuilder(); + try { + if ( spotlightSpotter != null && !spotlightSpotter.isEmpty() ) + data.append( URLEncoder.encode( "spotter", "UTF-8" ) + "=" + URLEncoder.encode( spotlightSpotter, "UTF-8" ) + "&" ); + data.append( URLEncoder.encode( "text", "UTF-8" ) + "=" + URLEncoder.encode( text, "UTF-8" ) ); + } catch (UnsupportedEncodingException e) { + throw new EngineException( "Data for the httprequest could not be converted. Error: " + e.getMessage() ); + } + + HttpURLConnection connection = null; + StringBuffer response = new StringBuffer(); + + try { + //Create connection + URL url = new URL( spotlightUrl ); + connection = ( HttpURLConnection )url.openConnection(); + connection.setRequestMethod( "POST" ); + connection.setRequestProperty( "Content-Type", "application/x-www-form-urlencoded" ); + connection.setRequestProperty( "Accept", "text/xml" ); + + connection.setUseCaches( false ); + connection.setDoInput( true ); + connection.setDoOutput( true ); + + //Send request + DataOutputStream wr = new DataOutputStream ( + connection.getOutputStream ()); + wr.writeBytes( data.toString() ); + wr.flush (); + wr.close (); + + //Get Response + InputStream is = connection.getInputStream(); + BufferedReader rd = new BufferedReader( new InputStreamReader( is ) ); + String line; + while((line = rd.readLine()) != null) { + response.append( line ); + response.append( '\r' ); + } + rd.close(); + + } catch (Exception e) { + + log.error( "[request] Request could not be made. 
Error: " + e.getMessage() ); + e.printStackTrace(); + return null; + + } finally { + + if(connection != null) { + connection.disconnect(); + } + } + + + XMLParser xmlParser = new XMLParser(); + try { + Document xmlDoc = xmlParser.loadXMLFromString( response.toString() ); + NodeList nlist = xmlParser.getElementsByTagName( xmlDoc, "surfaceForm" ); + Collection<DBPSLSurfaceForm> annos = this.getAnnotations( nlist ); + + return annos; + } catch ( Exception e) { + log.error( "[response] Response XML could not be parsed. Error: " + e.getMessage() ); + throw new EngineException( "Response XML could not be parsed. Error: " + e.getMessage() ); + } + } + + + /** + * This method creates the Collection of surface forms, which the method <code>createEnhancement</code> + * adds to the meta data of the content item as TextAnnotations. + * @param nList NodeList of all Resources contained in the XML response from DBpedia Spotlight + * @return a Collection<DBPSLSurfaceForm> with all annotations + */ + private Collection<DBPSLSurfaceForm> getAnnotations( NodeList nList ) { + Collection<DBPSLSurfaceForm> dbpslAnnos = new HashSet<DBPSLSurfaceForm>(); + + for (int temp = 0; temp < nList.getLength(); temp++) { + DBPSLSurfaceForm dbpslann = new DBPSLSurfaceForm(); + Element node = (Element) nList.item(temp); + dbpslann.name = node.getAttribute( "name" ); + dbpslann.offset = (new Integer( node.getAttribute( "offset" ) ) ).intValue(); + dbpslann.type = node.getAttribute( "type" ); + + dbpslAnnos.add( dbpslann ); + } + + return dbpslAnnos; + } + + + public Map<String, Object> getServiceProperties() { + return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder)); + } + + + public String getMetadataLanguage(MGraph model, NonLiteral subj) { + Iterator<Triple> it = model.filter(subj, DC_LANGUAGE, null); + if (it.hasNext()) { + Resource langNode = it.next().getObject(); + return getLexicalForm(langNode); + } + return null; + } + + public String 
getLexicalForm(Resource res) { + if (res == null) { + return null; + } else if (res instanceof Literal) { + return ((Literal) res).getLexicalForm(); + } else { + return res.toString(); + } + } + + + /** + * This method is used by the test class to set the endpoint url + * @param url String the url of the Spotlight endpoint + */ + public void setEndpointUrl( String url ) { + spotlightUrl = url; + } + +} Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/XMLParser.java Mon Aug 20 12:11:01 2012 @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.dbpspotlightspot; + +import java.io.ByteArrayInputStream; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; + +import javax.xml.parsers.DocumentBuilder; +import javax.xml.parsers.DocumentBuilderFactory; +import javax.xml.parsers.ParserConfigurationException; + +import org.w3c.dom.Document; +import org.w3c.dom.NodeList; +import org.xml.sax.SAXException; + + +/** + * Parses the XML results given by DBPedia Spotlight. + * + * @author <a href="mailto:[email protected]">Iavor Jelev</a> + */ + +public class XMLParser { + + public NodeList getElementsByTagName( Document doc, String tagName ) { + + return doc.getElementsByTagName( tagName ); + } + + + public Document loadXMLFromString( String xml ) throws SAXException, IOException { + Document doc = loadXMLFromInputStream( new ByteArrayInputStream( xml.getBytes() ) ); + doc.getDocumentElement().normalize(); + + return doc; + } + + + public Document loadXMLFromInputStream( InputStream is ) throws SAXException, IOException { + DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); + factory.setNamespaceAware( true ); + DocumentBuilder builder = null; + try { + builder = factory.newDocumentBuilder(); + } + catch ( ParserConfigurationException ex ) { + } + Document doc = builder.parse(is); + is.close(); + doc.getDocumentElement().normalize(); + + return doc; + } + + + public Document loadXMLFromFile( String filePath ) throws ParserConfigurationException, SAXException, IOException { + File fXmlFile = new File( filePath ); + DocumentBuilderFactory dbFactory = DocumentBuilderFactory.newInstance(); + DocumentBuilder dBuilder = dbFactory.newDocumentBuilder(); + Document doc = dBuilder.parse(fXmlFile); + doc.getDocumentElement().normalize(); + + return doc; + } +} \ No newline at end of file Added: 
incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/main/resources/OSGI-INF/metatype/metatype.properties Mon Aug 20 12:11:01 2012 @@ -0,0 +1,32 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
 + + + +# This file contains localization strings for configuration labels and +# descriptions as used in the metatype.xml descriptor generated by the +# maven SCR plugin + +stanbol.DBPSpotlightSpotEnhancementEngine.name = DBpedia Spotlight Spotter: Named Entity Recognition +stanbol.DBPSpotlightSpotEnhancementEngine.description = This engine performs just Named Entity Recognition, \ + so it is suited for EnhancementChain scenario, in which another Engine links the recognized TextAnnotations \ + to Ontology Types +stanbol.DBPSpotlightSpotEnhancementEngine.url.name = Spotlight URL +stanbol.DBPSpotlightSpotEnhancementEngine.url.description = The URL which will be used for the request +stanbol.DBPSpotlightSpotEnhancementEngine.spotter.name = Spotter +stanbol.DBPSpotlightSpotEnhancementEngine.spotter.description = The algorithm which will be used for Spotting \ + (aka Term Recognition). Currently available: NER, LingPipeSpotter, OpenNLPChunkerSpotter, Kea Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/java/org/apache/stanbol/enhancer/engines/dbpspotlightspot/core/DBPSpotlightSpotEnhancementTest.java Mon Aug 20 12:11:01 2012 @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * 
contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.stanbol.enhancer.engines.dbpspotlightspot.core; + +import java.util.Collection; + +import org.apache.stanbol.enhancer.engines.dbpspotlightspot.DBPSLSurfaceForm; +import org.apache.stanbol.enhancer.engines.dbpspotlightspot.DBPSpotlightSpotEnhancementEngine; +import org.apache.stanbol.enhancer.servicesapi.EngineException; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; +import org.osgi.service.cm.ConfigurationException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class provides a JUnit test for DBpedia Spotlight Spot EnhancementEngine. + * @author Iavor Jelev, babelmonkeys / GzEvD + */ +public class DBPSpotlightSpotEnhancementTest { + + /** + * This contains the logger. + */ + private static final Logger LOG = LoggerFactory.getLogger(DBPSpotlightSpotEnhancementTest.class); + private static String SPL_URL = System.getProperty(DBPSpotlightSpotEnhancementEngine.SL_URL_KEY) == null ? 
+ "http://spotlight.dbpedia.org/rest/spot" : (String) System.getProperty(DBPSpotlightSpotEnhancementEngine.SL_URL_KEY); + private static String TEST_TEXT = "President Obama is meeting Angela Merkel in Berlin on Monday"; + private static DBPSpotlightSpotEnhancementEngine dbpslight; + + @BeforeClass + public static void oneTimeSetup() throws ConfigurationException { + dbpslight = new DBPSpotlightSpotEnhancementEngine(); + dbpslight.setEndpointUrl( SPL_URL ); + } + + + @Test + public void testEntityExtraction() { + Collection<DBPSLSurfaceForm> entities; + try { + entities = dbpslight.doPostRequest( TEST_TEXT ); + LOG.info("Found entities: {}",entities.size()); + LOG.debug("Entities:\n{}",entities); + Assert.assertFalse("No entities were found!", entities.isEmpty()); + } catch (EngineException e) { + Assert.assertFalse("An EngineException occurred! The message was: " + e.getMessage(), true); + } + } + +} Added: incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README?rev=1374984&view=auto ============================================================================== --- incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README (added) +++ incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpspotlightspot/src/test/resources/README Mon Aug 20 12:11:01 2012 @@ -0,0 +1,15 @@ +Licensed to the Apache Software Foundation (ASF) under one or more +contributor license agreements. See the NOTICE file distributed with +this work for additional information regarding copyright ownership. +The ASF licenses this file to You under the Apache License, Version 2.0 +(the "License"); you may not use this file except in compliance with +the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +
