Author: mattmann
Date: Mon May 25 00:56:24 2015
New Revision: 1681541
URL: http://svn.apache.org/r1681541
Log:
fix for TIKA-1614 Geo Topic Parser contributed by aranyali
<[email protected]> and modified and updated by Chris Mattmann; this closes
#43.
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
Modified:
tika/trunk/tika-app/pom.xml
tika/trunk/tika-bundle/pom.xml
tika/trunk/tika-parsers/pom.xml
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Modified: tika/trunk/tika-app/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- tika/trunk/tika-app/pom.xml (original)
+++ tika/trunk/tika-app/pom.xml Mon May 25 00:56:24 2015
@@ -94,8 +94,7 @@
<dependency>
<artifactId>commons-io</artifactId>
<groupId>commons-io</groupId>
- <version>2.1</version>
- <scope>test</scope>
+ <version>2.4</version>
</dependency>
</dependencies>
Modified: tika/trunk/tika-bundle/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Mon May 25 00:56:24 2015
@@ -125,7 +125,8 @@
</Bundle-Activator>
<Embed-Dependency>
tika-parsers;inline=true,
- commons-compress, xz, commons-codec, commons-csv, junrar,
+ commons-compress, xz, commons-codec, commons-csv,
+ commons-io, commons-exec, junrar,
pdfbox,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
xmlbeans,
@@ -134,11 +135,11 @@
juniversalchardet,
vorbis-java-core, vorbis-java-tika,
isoparser, aspectjrt,
- metadata-extractor, xmpcore,
- boilerpipe, rome,
+ metadata-extractor, xmpcore, json-simple,
+ boilerpipe, rome, opennlp-tools, opennlp-maxent,
geoapi, sis-metadata, sis-netcdf, sis-utility,
sis-storage, apache-mime4j-core, apache-mime4j-dom,
- jsr-275, jhighlight, java-libpst,
+ jsr-275, jhighlight, java-libpst, jwnl,
netcdf4, grib, cdm, httpservices, jcip-annotations,
jmatio, guava
</Embed-Dependency>
@@ -184,6 +185,11 @@
opendap.dap.http;resolution:=optional,
opendap.dap;resolution:=optional,
opendap.dap.parser;resolution:=optional,
+ opennlp.maxent;resolution:=optional,
+ opennlp.tools.namefind;resolution:=optional,
+ net.didion.jwnl;resolution:=optional,
+ org.apache.commons.exec;resolution:=optional,
+ org.apache.commons.io;resolution:=optional,
org.apache.commons.httpclient;resolution:=optional,
org.apache.commons.httpclient.auth;resolution:=optional,
org.apache.commons.httpclient.methods;resolution:=optional,
@@ -233,6 +239,7 @@
org.jdom2;resolution:=optional,
org.jdom2.input;resolution:=optional,
org.jdom2.output;resolution:=optional,
+ org.json.simple;resolution:=optional,
org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
org.osgi.framework;resolution:=optional,
Modified: tika/trunk/tika-parsers/pom.xml
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon May 25 00:56:24 2015
@@ -224,6 +224,32 @@
<scope>provided</scope>
</dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ <version>1.5.3</version>
+ </dependency>
+
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ <version>2.4</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.commons</groupId>
+ <artifactId>commons-exec</artifactId>
+ <version>1.3</version>
+ </dependency>
+
+ <dependency>
+
<groupId>com.googlecode.json-simple</groupId>
+ <artifactId>json-simple</artifactId>
+ <version>1.1.1</version>
+ </dependency>
+
+
+
<!-- Test dependencies -->
<dependency>
<groupId>junit</groupId>
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1681541&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
Mon May 25 00:56:24 2015
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteException;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.JSONValue;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParser extends AbstractParser {
+
+ private static final long serialVersionUID = -2241391757440215491L;
+ private static final MediaType MEDIA_TYPE = MediaType
+ .application("geotopic");
+ private static final Set<MediaType> SUPPORTED_TYPES =
Collections
+ .singleton(MEDIA_TYPE);
+ private GeoParserConfig config = new
GeoParserConfig();
+ private static final Logger LOG =
Logger.getLogger(GeoParser.class.getName());
+
+ @Override
+ public Set<MediaType>
getSupportedTypes(ParseContext parseContext) {
+ return SUPPORTED_TYPES;
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler,
+ Metadata metadata, ParseContext context) throws IOException,
+ SAXException, TikaException {
+
+ /*----------------configure this parser by ParseContext
Object---------------------*/
+ config =
context.get(GeoParserConfig.class,
+ config);
+ String
nerModelPath = config.getNERPath();
+
+
if(!isAvailable()){
+
return;
+
}
+
+ /*----------------get locationNameEntities and best nameEntity
for the input stream---------------------*/
+ NameEntityExtractor extractor = new
NameEntityExtractor(nerModelPath);
+
extractor.getAllNameEntitiesfromInput(stream);
+
extractor.getBestNameEntity();
+
ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+
String bestner = extractor.bestNameEntity;
+
+ /*------------------------resolve geonames for each ner, store
results in a hashmap---------------------*/
+ HashMap<String,
ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+
+ /*----------------store locationNameEntities and their geonames
in a geotag, each input has one geotag---------------------*/
+ GeoTag geotag = new GeoTag();
+
geotag.toGeoTag(resolvedGeonames, bestner);
+
+ /* add resolved entities in metadata */
+
+ metadata.add("Geographic_NAME", geotag.Geographic_NAME);
+
metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
+
metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+
for (int i = 0; i <
geotag.alternatives.size(); ++i) {
+
GeoTag alter = (GeoTag)
geotag.alternatives.get(i);
+
metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+
metadata.add("Optional_LONGITUDE" + (i + 1),
+
alter.Geographic_LONGTITUDE);
+
metadata.add("Optional_LATITUDE" + (i + 1),
+
alter.Geographic_LATITUDE);
+
}
+
}
+
+ public HashMap<String, ArrayList<String>> searchGeoNames(
+ ArrayList<String> locationNameEntities) throws
ExecuteException,
+
IOException {
+
CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
+
ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+
cmdLine.addArgument("-s");
+
for (String name :
locationNameEntities) {
+
cmdLine.addArgument(name);
+
}
+
+ LOG.fine("Executing: " + cmdLine);
+ DefaultExecutor exec = new
DefaultExecutor();
+ exec.setExitValue(0);
+ ExecuteWatchdog
watchdog = new ExecuteWatchdog(60000);
+
exec.setWatchdog(watchdog);
+
PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+
exec.setStreamHandler(streamHandler);
+
int exitValue = exec.execute(cmdLine,
+
EnvironmentUtils.getProcEnvironment());
+
String outputJson =
outputStream.toString("UTF-8");
+
JSONArray json = (JSONArray)
JSONValue.parse(outputJson);
+
+ HashMap<String, ArrayList<String>> returnHash = new
HashMap<String, ArrayList<String>>();
+ for (int i = 0; i < json.size(); i++) {
+ JSONObject obj = (JSONObject)
json.get(i);
+ for (Object key :
obj.keySet()) {
+ String
theKey = (String) key;
+
JSONArray vals = (JSONArray) obj.get(theKey);
+
ArrayList<String> stringVals = new ArrayList<String>(
+
vals.size());
+
for (int j = 0;
j < vals.size(); j++) {
+
String val = (String) vals.get(j);
+
stringVals.add(val);
+
}
+
+ returnHash.put(theKey, stringVals);
+ }
+ }
+
+ return returnHash;
+
+ }
+
+ public boolean isAvailable(){
+ return ExternalParser.check(new
String[]{"lucene-geo-gazetteer", "--help"}, -1)
+
&& config.getNERPath() != null && !config.getNERPath().equals("");
+
}
+
+}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1681541&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
Mon May 25 00:56:24 2015
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.File;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+
/**
 * Configuration for {@code GeoParser}: holds the file system path of the
 * OpenNLP location NER model.
 *
 * <p>By default the path points at the {@code en-ner-location.bin} resource
 * found next to this class, if that resource exists and is a plain file on
 * disk. Otherwise the path is {@code null} until
 * {@link #setNERModelPath(String)} is called with an existing file.</p>
 */
public class GeoParserConfig implements Serializable {

    private static final long serialVersionUID = 1L;

    /** Resource name of the bundled model, resolved relative to this class. */
    private static final String DEFAULT_MODEL_RESOURCE = "en-ner-location.bin";

    private String nerModelPath = null;

    /**
     * Creates a configuration that uses the bundled model when it is
     * available as a file; leaves the model path {@code null} otherwise.
     */
    public GeoParserConfig() {
        java.net.URL modelUrl = GeoParserConfig.class
                .getResource(DEFAULT_MODEL_RESOURCE);
        if (modelUrl == null) {
            return;
        }
        try {
            this.nerModelPath = new File(modelUrl.toURI()).getAbsolutePath();
        } catch (URISyntaxException e) {
            logModelLookupFailure(modelUrl, e);
        } catch (IllegalArgumentException e) {
            // File(URI) rejects non-"file:" URIs, e.g. a resource packed in a jar
            logModelLookupFailure(modelUrl, e);
        }
    }

    /**
     * Sets the NER model path. The call is ignored when {@code path} is
     * {@code null}, does not exist, or denotes a directory.
     *
     * @param path file system path of the NER model
     */
    public void setNERModelPath(String path) {
        if (path == null) {
            return;
        }
        File file = new File(path);
        if (file.isDirectory() || !file.exists()) {
            return;
        }
        nerModelPath = path;
    }

    /**
     * @return the configured NER model path, or {@code null} if none is set
     */
    public String getNERPath() {
        return nerModelPath;
    }

    /** Reports why the bundled model could not be turned into a file path. */
    private static void logModelLookupFailure(java.net.URL modelUrl,
            Exception cause) {
        java.util.logging.Logger.getLogger(GeoParserConfig.class.getName())
                .log(java.util.logging.Level.WARNING,
                        "Cannot use bundled NER model at " + modelUrl
                                + " as a file", cause);
    }

}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java?rev=1681541&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
Mon May 25 00:56:24 2015
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
/**
 * A resolved geographic location (name, longitude, latitude) together with
 * a list of alternative locations found in the same input.
 *
 * <p>The field names, including the misspelled
 * {@code Geographic_LONGTITUDE}, are read directly by {@code GeoParser} and
 * are therefore kept as they are.</p>
 */
public class GeoTag {
    String Geographic_NAME;
    String Geographic_LONGTITUDE;
    String Geographic_LATITUDE;
    ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();

    /**
     * Sets the location stored in this tag.
     *
     * @param name      location name
     * @param longitude longitude as text
     * @param latitude  latitude as text
     */
    public void setMain(String name, String longitude, String latitude) {
        Geographic_NAME = name;
        Geographic_LONGTITUDE = longitude;
        Geographic_LATITUDE = latitude;
    }

    /**
     * Appends an alternative location to this tag.
     *
     * @param geotag the alternative to add
     */
    public void addAlternative(GeoTag geotag) {
        alternatives.add(geotag);
    }

    /**
     * Stores resolved geoname entities in this GeoTag. The entry whose key
     * equals {@code bestNER} becomes the main location; every other entry is
     * added as an alternative. Each value list is read as
     * {@code [name, longitude, latitude]}.
     *
     * <p>Entries whose value list is {@code null} or has fewer than three
     * elements are skipped, and a {@code null} map is treated as empty.</p>
     *
     * @param resolvedGeonames resolved entities keyed by entity name
     * @param bestNER          best name entity among all the extracted
     *                         entities for the input stream; may be
     *                         {@code null}, in which case every entry is an
     *                         alternative
     */
    public void toGeoTag(HashMap<String, ArrayList<String>> resolvedGeonames,
            String bestNER) {
        if (resolvedGeonames == null) {
            return;
        }

        for (java.util.Map.Entry<String, ArrayList<String>> entry : resolvedGeonames
                .entrySet()) {
            ArrayList<String> cur = entry.getValue();
            if (cur == null || cur.size() < 3) {
                continue; // incomplete gazetteer result, nothing usable
            }

            String key = entry.getKey();
            if (key != null && key.equals(bestNER)) {
                setMain(cur.get(0), cur.get(1), cur.get(2));
            } else {
                GeoTag alter = new GeoTag();
                alter.setMain(cur.get(0), cur.get(1), cur.get(2));
                addAlternative(alter);
            }
        }
    }
}
Added:
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1681541&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
(added)
+++
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
Mon May 25 00:56:24 2015
@@ -0,0 +1,108 @@
+package org.apache.tika.parser.geo.topic;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.Span;
+
+import org.apache.commons.io.IOUtils;
+
+public class NameEntityExtractor {
+ private String nerModelPath = null;
+ ArrayList<String> locationNameEntities;
+ String bestNameEntity;
+ private HashMap<String, Integer> tf;
+
+ public NameEntityExtractor(String nerModelpath) {
+ this.locationNameEntities = new ArrayList<String>();
+ this.bestNameEntity = null;
+ this.nerModelPath
= nerModelpath;
+
tf = new HashMap<String, Integer>();
+
+ }
+
+ /*
+ * Use OpenNLP to extract location names that's appearing in the steam.
+ * OpenNLP's default Name Finder accuracy is not very good, please
refer to
+ * its documentation.
+ *
+ * @param stream stream that passed from this.parse()
+ */
+
+ public void getAllNameEntitiesfromInput(InputStream stream)
+ throws InvalidFormatException, IOException {
+
+ InputStream modelIn = new FileInputStream(nerModelPath);
+ TokenNameFinderModel model = new
TokenNameFinderModel(modelIn);
+ NameFinderME nameFinder = new
NameFinderME(model);
+ String[] in =
IOUtils.toString(stream, "UTF-8").split(" ");
+
+ Span nameE[] = nameFinder.find(in);
+
+ String spanNames = Arrays.toString(Span.spansToStrings(nameE,
in));
+ spanNames = spanNames.substring(1, spanNames.length() -
1);
+ modelIn.close();
+ String[] tmp = spanNames.split(",");
+
+ for (String name : tmp) {
+ name = name.trim();
+ this.locationNameEntities.add(name);
+ }
+
+ }
+
+ /*
+ * Get the best location entity extracted from the input stream. Simply
+ * return the most frequent entity, If there several highest frequent
+ * entity, pick one randomly. May not be the optimal solution, but
works.
+ *
+ * @param locationNameEntities OpenNLP name finder's results,
stored in
+ * ArrayList
+ */
+ public void getBestNameEntity() {
+ if (this.locationNameEntities.size() == 0)
+ return;
+
+ for (int i = 0; i < this.locationNameEntities.size(); ++i) {
+ if (tf.containsKey(this.locationNameEntities.get(i)))
+ tf.put(this.locationNameEntities.get(i),
+
tf.get(this.locationNameEntities.get(i)) + 1);
+
else
+
tf.put(this.locationNameEntities.get(i), 1);
+
}
+
int max = 0;
+
List<Map.Entry<String, Integer>> list = new
ArrayList<Map.Entry<String, Integer>>(
+
tf.entrySet());
+
Collections.shuffle(list);
+
Collections.sort(list, new
Comparator<Map.Entry<String, Integer>>() {
+
public int compare(Map.Entry<String, Integer> o1,
+
Map.Entry<String, Integer> o2) {
+
return o2.getValue().compareTo(o1.getValue()); // descending
+
// order
+
+ }
+ });
+
+ this.locationNameEntities.clear();// update so that they are in
+
// descending order
+
for (Map.Entry<String, Integer> entry : list) {
+
this.locationNameEntities.add(entry.getKey());
+
if (entry.getValue() > max)
{
+
max
= entry.getValue();
+
this.bestNameEntity = entry.getKey();
+
}
+
}
+
}
+
+}
Modified:
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
---
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
(original)
+++
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
Mon May 25 00:56:24 2015
@@ -63,3 +63,5 @@ org.apache.tika.parser.grib.GribParser
org.apache.tika.parser.jdbc.SQLite3Parser
org.apache.tika.parser.isatab.ISArchiveParser
org.apache.tika.parser.geoinfo.GeographicInformationParser
+org.apache.tika.parser.geo.topic.GeoParser
+
Added:
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
URL:
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java?rev=1681541&view=auto
==============================================================================
---
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
(added)
+++
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
Mon May 25 00:56:24 2015
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParserTest {
+ private Parser geoparser = new GeoParser();
+
+ @Test
+ public void testFunctions() throws UnsupportedEncodingException,
+ IOException, SAXException, TikaException {
+ String text = "The millennial-scale cooling
trend that followed the HTM coincides with the decrease in China "
+ + "summer insolation driven by
slow changes in Earth's orbit. Despite the nearly linear forcing, the
transition from the HTM to "
+ + "the
Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand
how feedbacks and perturbations result in rapid changes, "
+
+ "a geographically distributed network of United States proxy climate
records was examined to study the spatial and temporal patterns of change, and
to "
+
+ "quantify the magnitude of change during these
transitions. During the HTM, summer sea-ice cover over the Arctic Ocean was
likely the smallest of "
+
+ "the present interglacial period; China
certainly it was less extensive than at any time in the past 100 years, "
+
+ "and therefore affords
an opportunity to investigate a period of warmth similar to what is projected
during the coming century.";
+
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ GeoParserConfig config = new
GeoParserConfig();
+
context.set(GeoParserConfig.class, config);
+
+ InputStream s = new
ByteArrayInputStream(text.getBytes("UTF-8"));
+ /* if it's not available no tests to run */
+ if (!((GeoParser)geoparser).isAvailable()) return;
+
+ geoparser.parse(s, new BodyContentHandler(), metadata, context);
+
+ assertNotNull(metadata.get("Geographic_NAME"));
+ assertNotNull(metadata.get("Geographic_LONGITUDE"));
+
assertNotNull(metadata.get("Geographic_LATITUDE"));
+ assertEquals("China",
metadata.get("Geographic_NAME"));
+
assertEquals("United States", metadata.get("Optional_NAME1"));
+
assertEquals("27.33931", metadata.get("Geographic_LATITUDE"));
+
assertEquals("-108.60288",
metadata.get("Geographic_LONGITUDE"));
+
assertEquals("39.76",
metadata.get("Optional_LATITUDE1"));
+
assertEquals("-98.5", metadata.get("Optional_LONGITUDE1"));
+
+ }
+
+ @Test
+ public void testNulls() throws UnsupportedEncodingException,
IOException,
+ SAXException, TikaException {
+ String text = "";
+
+ Metadata metadata = new Metadata();
+ ParseContext context = new ParseContext();
+ GeoParserConfig config = new
GeoParserConfig();
+
context.set(GeoParserConfig.class, config);
+
geoparser.parse(new ByteArrayInputStream(text.getBytes("UTF-8")),
+
new BodyContentHandler(), metadata,
context);
+
assertNull(metadata.get("Geographic_NAME"));
+
assertNull(metadata.get("Geographic_LONGITUDE"));
+
assertNull(metadata.get("Geographic_LATITUDE"));
+
+ }
+}