Repository: opennlp Updated Branches: refs/heads/trunk 92e541c93 -> 49f8e25a1
OPENNLP-582 Added morfologik addon. Thanks to Rodrigo Agerri for providing a patch. Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f3e90579 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f3e90579 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f3e90579 Branch: refs/heads/trunk Commit: f3e90579c5feba71dc4f04adaa4acc5ecc7f72e9 Parents: Author: Jörn Kottmann <[email protected]> Authored: Thu Nov 14 21:24:13 2013 +0000 Committer: Jörn Kottmann <[email protected]> Committed: Thu Nov 14 21:24:13 2013 +0000 ---------------------------------------------------------------------- pom.xml | 50 ++++++++++ .../lemmatizer/MorfologikLemmatizer.java | 96 ++++++++++++++++++++ 2 files changed, 146 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000..67e1eaa --- /dev/null +++ b/pom.xml @@ -0,0 +1,50 @@ +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.apache.opennlp</groupId> + <artifactId>morfologik-addon</artifactId> + <version>1.0-SNAPSHOT</version> + <packaging>jar</packaging> + <name>Morfologik Addon</name> + + <url>http://maven.apache.org</url> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>2.3.2</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + </plugins> + </build> + <properties> + <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> + </properties> + + <dependencies> + <dependency> + <groupId>org.carrot2</groupId> + <artifactId>morfologik-stemming</artifactId> + <version>1.6.0</version> + <scope>compile</scope> + </dependency> + + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + <version>1.6.0-SNAPSHOT</version> + </dependency> + + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>3.8.1</version> + <scope>test</scope> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/opennlp/blob/f3e90579/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java new file mode 100644 index 0000000..99694a5 --- /dev/null +++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.lemmatizer; + +import java.io.IOException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import opennlp.tools.lemmatizer.DictionaryLemmatizer; +import morfologik.stemming.Dictionary; +import morfologik.stemming.DictionaryLookup; +import morfologik.stemming.IStemmer; +import morfologik.stemming.WordData; + +public class MorfologikLemmatizer implements DictionaryLemmatizer { + + private IStemmer dictLookup; + public final Set<String> constantTags = new HashSet<String>(Arrays.asList( + "NNP", "NP00000")); + + public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException, + IOException { + dictLookup = new DictionaryLookup(Dictionary.read(dictURL)); + } + + private HashMap<List<String>, String> getLemmaTagsDict(String word) { + List<WordData> wdList = dictLookup.lookup(word); + HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>(); + for (WordData wd : wdList) { + List<String> wordLemmaTags = new ArrayList<String>(); + wordLemmaTags.add(word); + wordLemmaTags.add(wd.getTag().toString()); + dictMap.put(wordLemmaTags, wd.getStem().toString()); + } + return dictMap; + } + + private List<String> getDictKeys(String word, String postag) { + List<String> keys = new ArrayList<String>(); + if (constantTags.contains(postag)) { + keys.addAll(Arrays.asList(word, postag)); + } else { + keys.addAll(Arrays.asList(word.toLowerCase(), postag)); + } + return keys; + } + + private HashMap<List<String>, String> getDictMap(String word, String postag) { + HashMap<List<String>, String> dictMap = new HashMap<List<String>, String>(); + + if (constantTags.contains(postag)) { + dictMap = this.getLemmaTagsDict(word); + } else { + dictMap = this.getLemmaTagsDict(word.toLowerCase()); + } + return dictMap; + } + + public String lemmatize(String word, String postag) { + String lemma = null; + List<String> keys = this.getDictKeys(word, postag); + HashMap<List<String>, String> dictMap = this.getDictMap(word, postag); + // lookup lemma as value of the map + String keyValue = dictMap.get(keys); + if (keyValue != null) { + lemma = keyValue; + } else if (keyValue == null && constantTags.contains(postag)) { + lemma = word; + } else if (keyValue == null && word.toUpperCase() == word) { + lemma = word; + } else { + lemma = word.toLowerCase(); + } + return lemma; + } +}
