Author: mattmann
Date: Mon May 25 00:56:24 2015
New Revision: 1681541

URL: http://svn.apache.org/r1681541
Log:
fix for TIKA-1614 Geo Topic Parser contributed by aranyali 
<[email protected]> and modified and updated by Chris Mattmann; this closes 
#43.

Added:
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/
    tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
    
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/
    tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/
    
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
Modified:
    tika/trunk/tika-app/pom.xml
    tika/trunk/tika-bundle/pom.xml
    tika/trunk/tika-parsers/pom.xml
    
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser

Modified: tika/trunk/tika-app/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-app/pom.xml?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- tika/trunk/tika-app/pom.xml (original)
+++ tika/trunk/tika-app/pom.xml Mon May 25 00:56:24 2015
@@ -94,8 +94,7 @@
     <dependency>
       <artifactId>commons-io</artifactId>
       <groupId>commons-io</groupId>
-      <version>2.1</version>
-      <scope>test</scope>
+      <version>2.4</version>
     </dependency>
   </dependencies>
 

Modified: tika/trunk/tika-bundle/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-bundle/pom.xml?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- tika/trunk/tika-bundle/pom.xml (original)
+++ tika/trunk/tika-bundle/pom.xml Mon May 25 00:56:24 2015
@@ -125,7 +125,8 @@
             </Bundle-Activator>
             <Embed-Dependency>
               tika-parsers;inline=true,
-              commons-compress, xz, commons-codec, commons-csv, junrar,
+              commons-compress, xz, commons-codec, commons-csv, 
+              commons-io, commons-exec, junrar,
               
pdfbox,fontbox,jempbox,bcmail-jdk15on,bcprov-jdk15on,bcpkix-jdk15on,
               poi,poi-scratchpad,poi-ooxml,poi-ooxml-schemas,
               xmlbeans,
@@ -134,11 +135,11 @@
               juniversalchardet,
               vorbis-java-core, vorbis-java-tika,
               isoparser, aspectjrt,
-              metadata-extractor, xmpcore,
-              boilerpipe, rome,
+              metadata-extractor, xmpcore, json-simple, 
+              boilerpipe, rome, opennlp-tools, opennlp-maxent,
               geoapi, sis-metadata, sis-netcdf, sis-utility, 
               sis-storage, apache-mime4j-core, apache-mime4j-dom,
-              jsr-275, jhighlight, java-libpst,
+              jsr-275, jhighlight, java-libpst, jwnl, 
               netcdf4, grib, cdm, httpservices, jcip-annotations, 
               jmatio, guava
             </Embed-Dependency>
@@ -184,6 +185,11 @@
               opendap.dap.http;resolution:=optional,
               opendap.dap;resolution:=optional,
               opendap.dap.parser;resolution:=optional,
+             opennlp.maxent;resolution:=optional,
+             opennlp.tools.namefind;resolution:=optional,
+             net.didion.jwnl;resolution:=optional,
+             org.apache.commons.exec;resolution:=optional,
+             org.apache.commons.io;resolution:=optional,
               org.apache.commons.httpclient;resolution:=optional,
               org.apache.commons.httpclient.auth;resolution:=optional,
               org.apache.commons.httpclient.methods;resolution:=optional,
@@ -233,6 +239,7 @@
               org.jdom2;resolution:=optional,
               org.jdom2.input;resolution:=optional,
               org.jdom2.output;resolution:=optional,
+             org.json.simple;resolution:=optional,
               
org.openxmlformats.schemas.officeDocument.x2006.math;resolution:=optional,
               
org.openxmlformats.schemas.schemaLibrary.x2006.main;resolution:=optional,
               org.osgi.framework;resolution:=optional,

Modified: tika/trunk/tika-parsers/pom.xml
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/pom.xml?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- tika/trunk/tika-parsers/pom.xml (original)
+++ tika/trunk/tika-parsers/pom.xml Mon May 25 00:56:24 2015
@@ -224,6 +224,32 @@
         <scope>provided</scope>
     </dependency>
 
+       <dependency>
+         <groupId>org.apache.opennlp</groupId>
+           <artifactId>opennlp-tools</artifactId>
+             <version>1.5.3</version>
+             </dependency>
+             
+             <dependency>
+               <groupId>commons-io</groupId>
+                 <artifactId>commons-io</artifactId>
+                   <version>2.4</version>
+                   </dependency>
+                   
+                   <dependency>
+                       <groupId>org.apache.commons</groupId>
+                           <artifactId>commons-exec</artifactId>
+                               <version>1.3</version>
+                               </dependency>
+                               
+                               <dependency>
+                                   
<groupId>com.googlecode.json-simple</groupId>
+                                       <artifactId>json-simple</artifactId>
+                                           <version>1.1.1</version>
+                                           </dependency>
+                                           
+
+    
     <!-- Test dependencies -->
     <dependency>
       <groupId>junit</groupId>

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java?rev=1681541&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java
 Mon May 25 00:56:24 2015
@@ -0,0 +1,153 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright 
ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Set;
+import java.util.logging.Logger;
+
+import org.apache.commons.exec.CommandLine;
+import org.apache.commons.exec.DefaultExecutor;
+import org.apache.commons.exec.ExecuteException;
+import org.apache.commons.exec.ExecuteWatchdog;
+import org.apache.commons.exec.PumpStreamHandler;
+import org.apache.commons.exec.environment.EnvironmentUtils;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.json.simple.JSONArray;
+import org.json.simple.JSONObject;
+import org.json.simple.JSONValue;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+
+public class GeoParser extends AbstractParser {
+
+       private static final long serialVersionUID = -2241391757440215491L;
+       private static final MediaType MEDIA_TYPE = MediaType
+                      .application("geotopic");
+                      private static final Set<MediaType> SUPPORTED_TYPES = 
Collections
+                                     .singleton(MEDIA_TYPE);
+                                     private GeoParserConfig config = new 
GeoParserConfig();
+                                     private static final Logger LOG = 
Logger.getLogger(GeoParser.class.getName());
+                                     
+                                     @Override
+                                     public Set<MediaType> 
getSupportedTypes(ParseContext parseContext) {
+                                            return SUPPORTED_TYPES;
+                                            }
+
+       @Override
+       public void parse(InputStream stream, ContentHandler handler,
+                   Metadata metadata, ParseContext context) throws IOException,
+                                      SAXException, TikaException {
+
+               /*----------------configure this parser by ParseContext 
Object---------------------*/
+                                           config = 
context.get(GeoParserConfig.class,
+                                                       config);
+                                                               String 
nerModelPath = config.getNERPath();
+                                                                      
+                                                                       
if(!isAvailable()){
+                                                                               
        return;
+                                                                               
                }
+
+               /*----------------get locationNameEntities and best nameEntity 
for the input stream---------------------*/
+                                     NameEntityExtractor extractor = new 
NameEntityExtractor(nerModelPath);
+                                                         
extractor.getAllNameEntitiesfromInput(stream);
+                                                               
extractor.getBestNameEntity();
+                                                                       
ArrayList<String> locationNameEntities = extractor.locationNameEntities;
+                                                                               
          String bestner = extractor.bestNameEntity;
+
+               /*------------------------resolve geonames for each ner, store 
results in a hashmap---------------------*/
+                                                 HashMap<String, 
ArrayList<String>> resolvedGeonames = searchGeoNames(locationNameEntities);
+
+               /*----------------store locationNameEntities and their geonames 
in a geotag, each input has one geotag---------------------*/
+                                       GeoTag geotag = new GeoTag();
+                                              
geotag.toGeoTag(resolvedGeonames, bestner);
+
+               /* add resolved entities in metadata */
+
+               metadata.add("Geographic_NAME", geotag.Geographic_NAME);
+                                               
metadata.add("Geographic_LONGITUDE", geotag.Geographic_LONGTITUDE);
+                                                                               
     metadata.add("Geographic_LATITUDE", geotag.Geographic_LATITUDE);
+                                                                               
                                         for (int i = 0; i < 
geotag.alternatives.size(); ++i) {
+                                                                               
                                                  GeoTag alter = (GeoTag) 
geotag.alternatives.get(i);
+                                                                               
                                                               
metadata.add("Optional_NAME" + (i + 1), alter.Geographic_NAME);
+                                                                               
                                                                                
              metadata.add("Optional_LONGITUDE" + (i + 1),
+                                                                               
                                                                                
                                                       
alter.Geographic_LONGTITUDE);
+                                                                               
                                                                                
                                                         
metadata.add("Optional_LATITUDE" + (i + 1),
+                                                                               
                                                                                
                                                                                
                 alter.Geographic_LATITUDE);
+                                                                               
                                                                                
                                                                                
                  }
+                                                                               
                                                                                
                                                                                
                  }
+
+       public HashMap<String, ArrayList<String>> searchGeoNames(
+                              ArrayList<String> locationNameEntities) throws 
ExecuteException,
+                                                                      
IOException {
+                                                                               
   CommandLine cmdLine = new CommandLine("lucene-geo-gazetteer");
+                                                                               
               ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
+                                                                               
                                     cmdLine.addArgument("-s");
+                                                                               
                                        for (String name : 
locationNameEntities) {
+                                                                               
                                                    cmdLine.addArgument(name);
+                                                                               
                                                     }
+
+               LOG.fine("Executing: " + cmdLine);
+                                    DefaultExecutor exec = new 
DefaultExecutor();
+                                                    exec.setExitValue(0);
+                                                       ExecuteWatchdog 
watchdog = new ExecuteWatchdog(60000);
+                                                                       
exec.setWatchdog(watchdog);
+                                                                               
PumpStreamHandler streamHandler = new PumpStreamHandler(outputStream);
+                                                                               
                  exec.setStreamHandler(streamHandler);
+                                                                               
                        int exitValue = exec.execute(cmdLine,
+                                                                               
                                        EnvironmentUtils.getProcEnvironment());
+                                                                               
                                         String outputJson = 
outputStream.toString("UTF-8");
+                                                                               
                                                JSONArray json = (JSONArray) 
JSONValue.parse(outputJson);
+
+               HashMap<String, ArrayList<String>> returnHash = new 
HashMap<String, ArrayList<String>>();
+                               for (int i = 0; i < json.size(); i++) {
+                                        JSONObject obj = (JSONObject) 
json.get(i);
+                                                       for (Object key : 
obj.keySet()) {
+                                                                       String 
theKey = (String) key;
+                                                                               
        JSONArray vals = (JSONArray) obj.get(theKey);
+                                                                               
                         ArrayList<String> stringVals = new ArrayList<String>(
+                                                                               
                                                             vals.size());
+                                                                               
                                                                for (int j = 0; 
j < vals.size(); j++) {
+                                                                               
                                                                             
String val = (String) vals.get(j);
+                                                                               
                                                                                
                   stringVals.add(val);
+                                                                               
                                                                                
                      }
+
+                               returnHash.put(theKey, stringVals);
+                                                       }
+                                                               }
+
+               return returnHash;
+
+       }
+       
+       public boolean isAvailable(){
+               return ExternalParser.check(new 
String[]{"lucene-geo-gazetteer",  "--help"}, -1) 
+                                                                               
  && config.getNERPath() != null && !config.getNERPath().equals("");
+                                                                               
  }
+
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java?rev=1681541&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoParserConfig.java
 Mon May 25 00:56:24 2015
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.io.File;
+import java.io.Serializable;
+import java.net.URISyntaxException;
+
+public class GeoParserConfig implements Serializable {
+
+       private static final long serialVersionUID = 1L;
+       private String nerModelPath = null;
+
+       public GeoParserConfig() {
+              try {
+                  if (GeoParserConfig.class.getResource(
+                                                        "en-ner-location.bin") 
!= null){
+                      this.nerModelPath = new 
File(GeoParserConfig.class.getResource(
+                                                                               
      "en-ner-location.bin").toURI()).getAbsolutePath();
+                  }
+                  } catch (URISyntaxException e) {
+                      e.printStackTrace();
+                  }
+       }
+
+       public void setNERModelPath(String path) {
+              if (path == null)
+                       return;
+                               File file = new File(path);
+                                    if (file.isDirectory() || !file.exists()) {
+                                                           return;
+                                                               }
+                                                                       
nerModelPath = path;
+                                                                       }
+
+       public String getNERPath() {
+              return nerModelPath;
+              }
+
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java?rev=1681541&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/GeoTag.java
 Mon May 25 00:56:24 2015
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+
+public class GeoTag {
+       String Geographic_NAME;
+       String Geographic_LONGTITUDE;
+       String Geographic_LATITUDE;
+       ArrayList<GeoTag> alternatives = new ArrayList<GeoTag>();
+
+       public void setMain(String name, String longitude, String latitude) {
+              Geographic_NAME = name;
+                              Geographic_LONGTITUDE = longitude;
+                                                    Geographic_LATITUDE = 
latitude;
+                                                    }
+
+       public void addAlternative(GeoTag geotag) {
+              alternatives.add(geotag);
+              }
+
+       /*
+        * Store resolved geoName entities in a GeoTag
+         * 
+          * @param resolvedGeonames resolved entities
+           * 
+            * @param bestNER best name entity among all the extracted entities 
for the
+             * input stream
+              */
+              public void toGeoTag(HashMap<String, ArrayList<String>> 
resolvedGeonames,
+                          String bestNER) {
+
+               for (String key : resolvedGeonames.keySet()) {
+                           ArrayList<String> cur = resolvedGeonames.get(key);
+                                                 if (key.equals(bestNER)) {
+                                                                               
this.Geographic_NAME = cur.get(0);
+                                                                               
                        this.Geographic_LONGTITUDE = cur.get(1);
+                                                                               
                                                      this.Geographic_LATITUDE 
= cur.get(2);
+                                                                               
                                                                                
 } else {
+                                                                               
                                                                                
         GeoTag alter = new GeoTag();
+                                                                               
                                                                                
                        alter.Geographic_NAME = cur.get(0);
+                                                                               
                                                                                
                                                 alter.Geographic_LONGTITUDE = 
cur.get(1);
+                                                                               
                                                                                
                                                                                
alter.Geographic_LATITUDE = cur.get(2);
+                                                                               
                                                                                
                                                                                
                             this.addAlternative(alter);
+                                                                               
                                                                                
                                                                                
                               }
+                                                                               
                                                                                
                                                                                
                                }
+                                                                               
                                                                                
                                                                                
                                }
+}

Added: 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java?rev=1681541&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 (added)
+++ 
tika/trunk/tika-parsers/src/main/java/org/apache/tika/parser/geo/topic/NameEntityExtractor.java
 Mon May 25 00:56:24 2015
@@ -0,0 +1,108 @@
+package org.apache.tika.parser.geo.topic;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.Span;
+
+import org.apache.commons.io.IOUtils;
+
+public class NameEntityExtractor {
+       private String nerModelPath = null;
+       ArrayList<String> locationNameEntities;
+       String bestNameEntity;
+       private HashMap<String, Integer> tf;
+
+       public NameEntityExtractor(String nerModelpath) {
+              this.locationNameEntities = new ArrayList<String>();
+                                        this.bestNameEntity = null;
+                                                            this.nerModelPath 
= nerModelpath;
+                                                                              
tf = new HashMap<String, Integer>();
+
+       }
+
+       /*
+        * Use OpenNLP to extract location names that appear in the stream.
+         * OpenNLP's default Name Finder accuracy is not very good, please 
refer to
+          * its documentation.
+           * 
+            * @param stream stream that passed from this.parse()
+             */
+
+       public void getAllNameEntitiesfromInput(InputStream stream)
+                   throws InvalidFormatException, IOException {
+
+               InputStream modelIn = new FileInputStream(nerModelPath);
+                           TokenNameFinderModel model = new 
TokenNameFinderModel(modelIn);
+                                                NameFinderME nameFinder = new 
NameFinderME(model);
+                                                             String[] in = 
IOUtils.toString(stream, "UTF-8").split(" ");
+
+               Span nameE[] = nameFinder.find(in);
+
+               String spanNames = Arrays.toString(Span.spansToStrings(nameE, 
in));
+                      spanNames = spanNames.substring(1, spanNames.length() - 
1);
+                                modelIn.close();
+                                       String[] tmp = spanNames.split(",");
+
+               for (String name : tmp) {
+                           name = name.trim();
+                                  this.locationNameEntities.add(name);
+                                       }
+
+       }
+
+       /*
+        * Get the best location entity extracted from the input stream. Simply
+         * return the most frequent entity. If several entities share the
+          * top count, pick one randomly. May not be the optimal solution, but 
works.
+           * 
+            * @param locationNameEntities OpenNLP name finder's results, 
stored in
+             * ArrayList
+              */
+              public void getBestNameEntity() {
+                     if (this.locationNameEntities.size() == 0)
+                                                          return;
+
+               for (int i = 0; i < this.locationNameEntities.size(); ++i) {
+                        if (tf.containsKey(this.locationNameEntities.get(i)))
+                                       tf.put(this.locationNameEntities.get(i),
+                                                                               
tf.get(this.locationNameEntities.get(i)) + 1);
+                                                                               
                                           else
+                                                                               
                                              
tf.put(this.locationNameEntities.get(i), 1);
+                                                                               
                                                                                
       }
+                                                                               
                                                                                
        int max = 0;
+                                                                               
                                                                                
            List<Map.Entry<String, Integer>> list = new 
ArrayList<Map.Entry<String, Integer>>(
+                                                                               
                                                                                
                                                  tf.entrySet());
+                                                                               
                                                                                
                                                   Collections.shuffle(list);
+                                                                               
                                                                                
                                                    Collections.sort(list, new 
Comparator<Map.Entry<String, Integer>>() {
+                                                                               
                                                                                
                                                                               
public int compare(Map.Entry<String, Integer> o1,
+                                                                               
                                                                                
                                                                                
                                             Map.Entry<String, Integer> o2) {
+                                                                               
                                                                                
                                                                                
                                                                            
return o2.getValue().compareTo(o1.getValue()); // descending
+                                                                               
                                                                                
                                                                                
                                                                                
                                                          // order
+
+                       }
+                               });
+
+               this.locationNameEntities.clear();// update so that they are in
+                                                                               
                        // descending order
+                                                                               
                           for (Map.Entry<String, Integer> entry : list) {
+                                                                               
                                                  
this.locationNameEntities.add(entry.getKey());
+                                                                               
                                                    if (entry.getValue() > max) 
{
+                                                                               
                                                                           max 
= entry.getValue();
+                                                                               
                                                                                
  this.bestNameEntity = entry.getKey();
+                                                                               
                                                                                
                        }
+                                                                               
                                                                                
                         }
+                                                                               
                                                                                
                         }
+
+}

Modified: 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser?rev=1681541&r1=1681540&r2=1681541&view=diff
==============================================================================
--- 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 (original)
+++ 
tika/trunk/tika-parsers/src/main/resources/META-INF/services/org.apache.tika.parser.Parser
 Mon May 25 00:56:24 2015
@@ -63,3 +63,5 @@ org.apache.tika.parser.grib.GribParser
 org.apache.tika.parser.jdbc.SQLite3Parser
 org.apache.tika.parser.isatab.ISArchiveParser
 org.apache.tika.parser.geoinfo.GeographicInformationParser
+org.apache.tika.parser.geo.topic.GeoParser
+

Added: 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
URL: 
http://svn.apache.org/viewvc/tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java?rev=1681541&view=auto
==============================================================================
--- 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
 (added)
+++ 
tika/trunk/tika-parsers/src/test/java/org/apache/tika/parser/geo/topic/GeoParserTest.java
 Mon May 25 00:56:24 2015
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.tika.parser.geo.topic;
+
+import static org.junit.Assert.*;
+
+import org.junit.Test;
+
+import java.io.BufferedReader;
+import java.io.ByteArrayInputStream;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.HashMap;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.xml.sax.SAXException;
+/**
+ * Tests for {@link GeoParser}: checks the geographic metadata keys that are
+ * set after parsing a sample text, and that the primary keys stay unset for
+ * empty input.
+ */
+public class GeoParserTest {
+       private Parser geoparser = new GeoParser();
+       /**
+        * Parses a short passage that mentions "China" twice and "United States"
+        * once, then checks the Geographic_* and Optional_*1 metadata values.
+        * Returns without asserting anything when the parser reports that it is
+        * not available.
+        * NOTE(review): the expected coordinates presumably come from whatever
+        * gazetteer lookup GeoParser uses -- confirm that they are stable.
+        */
+       @Test
+       public void testFunctions() throws UnsupportedEncodingException,
+                   IOException, SAXException, TikaException {
+                                String text = "The millennial-scale cooling 
trend that followed the HTM coincides with the decrease in China "
+                                              + "summer insolation driven by 
slow changes in Earth's orbit. Despite the nearly linear forcing, the 
transition from the HTM to "
+                                                                   + "the 
Little Ice Age (1500-1900 AD) was neither gradual nor uniform. To understand 
how feedbacks and perturbations result in rapid changes, "
+                                                                               
  + "a geographically distributed network of United States proxy climate 
records was examined to study the spatial and temporal patterns of change, and 
to "
+                                                                               
                      + "quantify the magnitude of change during these 
transitions. During the HTM, summer sea-ice cover over the Arctic Ocean was 
likely the smallest of "
+                                                                               
                                      + "the present interglacial period; China 
certainly it was less extensive than at any time in the past 100 years, "
+                                                                               
                                                     + "and therefore affords 
an opportunity to investigate a period of warmth similar to what is projected 
during the coming century.";
+
+               Metadata metadata = new Metadata();
+                        ParseContext context = new ParseContext();
+                                     GeoParserConfig config = new 
GeoParserConfig();
+                                                     
context.set(GeoParserConfig.class, config);
+
+               InputStream s = new 
ByteArrayInputStream(text.getBytes("UTF-8"));
+               /* the parser is not available: nothing to assert, so the test ends here without failing */
+               if (!((GeoParser)geoparser).isAvailable()) return;
+
+               geoparser.parse(s, new BodyContentHandler(), metadata, context);
+
+               assertNotNull(metadata.get("Geographic_NAME"));
+                       assertNotNull(metadata.get("Geographic_LONGITUDE"));
+                               
assertNotNull(metadata.get("Geographic_LATITUDE"));
+                                       assertEquals("China", 
metadata.get("Geographic_NAME"));
+                                                             
assertEquals("United States", metadata.get("Optional_NAME1"));
+                                                                               
   assertEquals("27.33931", metadata.get("Geographic_LATITUDE"));
+                                                                               
                            assertEquals("-108.60288", 
metadata.get("Geographic_LONGITUDE"));
+                                                                               
                                                       assertEquals("39.76", 
metadata.get("Optional_LATITUDE1"));
+                                                                               
                                                                             
assertEquals("-98.5", metadata.get("Optional_LONGITUDE1"));  
+
+       }
+       /**
+        * Parses an empty string and checks that none of Geographic_NAME,
+        * Geographic_LONGITUDE and Geographic_LATITUDE is set in the metadata.
+        * NOTE(review): unlike testFunctions there is no isAvailable() guard
+        * here; presumably parse() is safe to call when the parser is
+        * unavailable -- confirm.
+        */
+       @Test
+       public void testNulls() throws UnsupportedEncodingException, 
IOException,
+                   SAXException, TikaException {
+                                 String text = "";
+
+               Metadata metadata = new Metadata();
+                        ParseContext context = new ParseContext();
+                                     GeoParserConfig config = new 
GeoParserConfig();
+                                                     
context.set(GeoParserConfig.class, config);
+                                                                               
         geoparser.parse(new ByteArrayInputStream(text.getBytes("UTF-8")),
+                                                                               
                                        new BodyContentHandler(), metadata, 
context);
+                                                                               
                                            
assertNull(metadata.get("Geographic_NAME"));
+                                                                               
                                             
assertNull(metadata.get("Geographic_LONGITUDE"));
+                                                                               
                                              
assertNull(metadata.get("Geographic_LATITUDE"));
+
+       }
+}


Reply via email to