Repository: opennlp
Updated Branches:
  refs/heads/trunk 92e541c93 -> 49f8e25a1


OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a 
patch.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f3e90579
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f3e90579
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f3e90579

Branch: refs/heads/trunk
Commit: f3e90579c5feba71dc4f04adaa4acc5ecc7f72e9
Parents: 
Author: Jörn Kottmann <[email protected]>
Authored: Thu Nov 14 21:24:13 2013 +0000
Committer: Jörn Kottmann <[email protected]>
Committed: Thu Nov 14 21:24:13 2013 +0000

----------------------------------------------------------------------
 pom.xml                                         | 50 ++++++++++
 .../lemmatizer/MorfologikLemmatizer.java        | 96 ++++++++++++++++++++
 2 files changed, 146 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
new file mode 100644
index 0000000..67e1eaa
--- /dev/null
+++ b/pom.xml
@@ -0,0 +1,50 @@
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>org.apache.opennlp</groupId>
+  <artifactId>morfologik-addon</artifactId>
+  <version>1.0-SNAPSHOT</version>
+  <packaging>jar</packaging>
+  <name>Morfologik Addon</name>
+
+  <url>http://maven.apache.org</url>
+    <build>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-compiler-plugin</artifactId>
+                <version>2.3.2</version>
+                <configuration>
+                    <source>1.7</source>
+                    <target>1.7</target>
+                </configuration>
+            </plugin>
+        </plugins>
+    </build>
+    <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+   <dependency>
+      <groupId>org.carrot2</groupId>
+      <artifactId>morfologik-stemming</artifactId>
+      <version>1.6.0</version>
+      <scope>compile</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.opennlp</groupId>
+      <artifactId>opennlp-tools</artifactId>
+      <version>1.6.0-SNAPSHOT</version>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <version>3.8.1</version>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+</project>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
new file mode 100644
index 0000000..99694a5
--- /dev/null
+++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.morfologik.lemmatizer;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.DictionaryLemmatizer;
+import morfologik.stemming.Dictionary;
+import morfologik.stemming.DictionaryLookup;
+import morfologik.stemming.IStemmer;
+import morfologik.stemming.WordData;
+
+/**
+ * Lemmatizer that resolves a word and its POS tag to a lemma by looking the
+ * word up in a Morfologik dictionary.
+ * <p>
+ * Words are lowercased before the lookup unless their POS tag is listed in
+ * {@link #constantTags}. When the dictionary has no entry for the
+ * (word, tag) pair, the word itself is returned, lowercased unless it is
+ * tagged with a constant tag or is already entirely upper case.
+ * <p>
+ * NOTE(review): {@code toLowerCase()} / {@code toUpperCase()} are called
+ * without an explicit locale and therefore use the JVM default locale;
+ * confirm this is intended for the target languages.
+ */
+public class MorfologikLemmatizer implements DictionaryLemmatizer {
+
+  /** Morfologik lookup used to retrieve the (stem, tag) entries of a word. */
+  private final IStemmer dictLookup;
+
+  /**
+   * POS tags for which the word keeps its original casing, both for the
+   * dictionary lookup and for the fallback lemma.
+   * NOTE(review): presumably proper-noun tags of the supported tag sets;
+   * confirm against the dictionaries in use.
+   */
+  public final Set<String> constantTags = new HashSet<String>(Arrays.asList(
+      "NNP", "NP00000"));
+
+  /**
+   * Creates a lemmatizer backed by the Morfologik dictionary at the given
+   * location.
+   *
+   * @param dictURL location of the Morfologik dictionary to read
+   * @throws IllegalArgumentException declared by the original signature;
+   *         propagated from dictionary loading
+   * @throws IOException if the dictionary cannot be read
+   */
+  public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException,
+      IOException {
+    dictLookup = new DictionaryLookup(Dictionary.read(dictURL));
+  }
+
+  /**
+   * Returns the form of the word used for dictionary access: unchanged when
+   * the tag is a constant tag, lowercased otherwise.
+   */
+  private String normalize(String word, String postag) {
+    return constantTags.contains(postag) ? word : word.toLowerCase();
+  }
+
+  /**
+   * Looks the word up in the dictionary and maps every returned
+   * [word, tag] pair to its stem. If the dictionary returns several entries
+   * with the same tag, the last one wins.
+   */
+  private HashMap<List<String>, String> getLemmaTagsDict(String word) {
+    List<WordData> wdList = dictLookup.lookup(word);
+    HashMap<List<String>, String> dictMap =
+        new HashMap<List<String>, String>();
+    for (WordData wd : wdList) {
+      CharSequence tag = wd.getTag();
+      CharSequence stem = wd.getStem();
+      // Entries without tag or stem data cannot be matched or used as a
+      // lemma; skip them instead of failing with a NullPointerException.
+      if (tag == null || stem == null) {
+        continue;
+      }
+      List<String> wordLemmaTags = new ArrayList<String>();
+      wordLemmaTags.add(word);
+      wordLemmaTags.add(tag.toString());
+      dictMap.put(wordLemmaTags, stem.toString());
+    }
+    return dictMap;
+  }
+
+  /**
+   * Builds the [word, tag] key under which the lemma is searched, using the
+   * same case normalization as the dictionary lookup.
+   */
+  private List<String> getDictKeys(String word, String postag) {
+    return new ArrayList<String>(
+        Arrays.asList(normalize(word, postag), postag));
+  }
+
+  /**
+   * Returns the [word, tag] to stem map for the case-normalized word.
+   */
+  private HashMap<List<String>, String> getDictMap(String word, String postag) {
+    return this.getLemmaTagsDict(normalize(word, postag));
+  }
+
+  /**
+   * Returns the lemma of the given word for the given POS tag.
+   *
+   * @param word the surface form to lemmatize
+   * @param postag the POS tag assigned to the word
+   * @return the dictionary stem for the (word, tag) pair if there is one;
+   *         otherwise the word itself when it carries a constant tag or is
+   *         entirely upper case, and the lowercased word in all other cases
+   */
+  public String lemmatize(String word, String postag) {
+    List<String> keys = this.getDictKeys(word, postag);
+    HashMap<List<String>, String> dictMap = this.getDictMap(word, postag);
+    // lookup lemma as value of the map
+    String lemma = dictMap.get(keys);
+    if (lemma != null) {
+      return lemma;
+    }
+    // Compare by content: the String API does not guarantee that
+    // toUpperCase() returns the same instance for an unchanged word, so a
+    // reference comparison (==) is not a reliable "is upper case" test.
+    if (constantTags.contains(postag) || word.toUpperCase().equals(word)) {
+      return word;
+    }
+    return word.toLowerCase();
+  }
+}

Reply via email to