OPENNLP-904 Harmonize lemmatizer API and function to get multiple lemmas. OPENNLP-904 add minor correction after PR comment.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c09c9a4b Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c09c9a4b Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c09c9a4b Branch: refs/heads/parser_regression Commit: c09c9a4baacf59b33cf1b7cd50f5f0f8d25e7955 Parents: ed246d8 Author: Rodrigo Agerri <[email protected]> Authored: Fri Feb 3 16:00:38 2017 +0100 Committer: Jörn Kottmann <[email protected]> Committed: Thu Apr 20 12:40:22 2017 +0200 ---------------------------------------------------------------------- .../cmdline/lemmatizer/LemmatizerMETool.java | 4 +- .../tools/lemmatizer/DictionaryLemmatizer.java | 70 ++++++++++++++------ .../lemmatizer/LemmaSampleEventStream.java | 2 +- .../tools/lemmatizer/LemmaSampleStream.java | 4 +- .../opennlp/tools/lemmatizer/Lemmatizer.java | 16 ++++- .../opennlp/tools/lemmatizer/LemmatizerME.java | 64 ++++++++++++++++-- .../tools/lemmatizer/DummyLemmatizer.java | 7 ++ .../tools/lemmatizer/LemmatizerMETest.java | 3 +- 8 files changed, 136 insertions(+), 34 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java index e4e47b5..90ba95d 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/lemmatizer/LemmatizerMETool.java @@ -72,10 +72,8 @@ public class LemmatizerMETool extends BasicCmdLineTool { continue; } - String[] preds = lemmatizer.lemmatize(posSample.getSentence(), + String[] lemmas = 
lemmatizer.lemmatize(posSample.getSentence(), posSample.getTags()); - String[] lemmas = lemmatizer.decodeLemmas(posSample.getSentence(), - preds); System.out.println(new LemmaSample(posSample.getSentence(), posSample.getTags(), lemmas).toString()); http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java index b1b04a1..9f0b0b0 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java @@ -37,7 +37,7 @@ public class DictionaryLemmatizer implements Lemmatizer { /** * The hashmap containing the dictionary. */ - private final Map<List<String>, String> dictMap; + private final Map<List<String>, List<String>> dictMap; /** * Construct a hashmap from the input tab separated dictionary. 
@@ -47,26 +47,24 @@ public class DictionaryLemmatizer implements Lemmatizer { * @param dictionary * the input dictionary via inputstream */ - public DictionaryLemmatizer(final InputStream dictionary) { + public DictionaryLemmatizer(final InputStream dictionary) throws IOException { this.dictMap = new HashMap<>(); - final BufferedReader breader = new BufferedReader(new InputStreamReader(dictionary)); + final BufferedReader breader = new BufferedReader( + new InputStreamReader(dictionary)); String line; - try { - while ((line = breader.readLine()) != null) { - final String[] elems = line.split("\t"); - this.dictMap.put(Arrays.asList(elems[0], elems[1]), elems[2]); - } - } catch (final IOException e) { - e.printStackTrace(); + while ((line = breader.readLine()) != null) { + final String[] elems = line.split("\t"); + this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(elems[2])); } } + /** * Get the Map containing the dictionary. * * @return dictMap the Map */ - public Map<List<String>, String> getDictMap() { + public Map<List<String>, List<String>> getDictMap() { return this.dictMap; } @@ -85,31 +83,65 @@ public class DictionaryLemmatizer implements Lemmatizer { return keys; } + public String[] lemmatize(final String[] tokens, final String[] postags) { List<String> lemmas = new ArrayList<>(); for (int i = 0; i < tokens.length; i++) { - lemmas.add(this.apply(tokens[i], postags[i])); + lemmas.add(this.lemmatize(tokens[i], postags[i])); } return lemmas.toArray(new String[lemmas.size()]); } + public List<List<String>> lemmatize(final List<String> tokens, final List<String> posTags) { + List<List<String>> allLemmas = new ArrayList<>(); + for (int i = 0; i < tokens.size(); i++) { + allLemmas.add(this.getAllLemmas(tokens.get(i), posTags.get(i))); + } + return allLemmas; + } + /** * Lookup lemma in a dictionary. Outputs "O" if not found. 
- * @param word the token - * @param postag the postag + * + * @param word + * the token + * @param postag + * the postag * @return the lemma */ - public String apply(final String word, final String postag) { + private String lemmatize(final String word, final String postag) { String lemma; final List<String> keys = this.getDictKeys(word, postag); // lookup lemma as value of the map - final String keyValue = this.dictMap.get(keys); - if (keyValue != null) { - lemma = keyValue; + final List<String> keyValues = this.dictMap.get(keys); + if (!keyValues.isEmpty()) { + lemma = keyValues.get(0); } else { lemma = "O"; } return lemma; } -} + /** + * Lookup every lemma for a word,pos tag in a dictionary. Outputs "O" if not + * found. + * + * @param word + * the token + * @param postag + * the postag + * @return every lemma + */ + private List<String> getAllLemmas(final String word, final String postag) { + List<String> lemmasList = new ArrayList<>(); + final List<String> keys = this.getDictKeys(word, postag); + // lookup lemma as value of the map + final List<String> keyValues = this.dictMap.get(keys); + if (!keyValues.isEmpty()) { + lemmasList.addAll(keyValues); + } else { + lemmasList.add("O"); + } + return lemmasList; + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java index fc1a558..a8d71e8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleEventStream.java @@ -49,7 +49,7 @@ public class LemmaSampleEventStream extends AbstractEventStream<LemmaSample> { List<Event> events = new ArrayList<>(); String[] toksArray = 
sample.getTokens(); String[] tagsArray = sample.getTags(); - String[] lemmasArray = sample.getLemmas(); + String[] lemmasArray = LemmatizerME.encodeLemmas(toksArray,sample.getLemmas()); for (int ei = 0, el = sample.getTokens().length; ei < el; ei++) { events.add(new Event(lemmasArray[ei], contextGenerator.getContext(ei,toksArray,tagsArray,lemmasArray))); http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java index 0a133c3..9c661a5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmaSampleStream.java @@ -23,7 +23,6 @@ import java.util.List; import opennlp.tools.util.FilterObjectStream; import opennlp.tools.util.ObjectStream; -import opennlp.tools.util.StringUtil; /** @@ -51,8 +50,7 @@ public class LemmaSampleStream extends FilterObjectStream<String, LemmaSample> { else { toks.add(parts[0]); tags.add(parts[1]); - String ses = StringUtil.getShortestEditScript(parts[0], parts[2]); - preds.add(ses); + preds.add(parts[2]); } } if (toks.size() > 0) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java index f21f9e3..933eec1 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/Lemmatizer.java @@ -17,19 +17,31 @@ package opennlp.tools.lemmatizer; +import java.util.List; + /** * 
The interface for lemmatizers. */ public interface Lemmatizer { /** - * Generates lemma tags for the word and postag returning the result in an array. + * Generates lemmas for the word and postag returning the result in an array. * * @param toks an array of the tokens * @param tags an array of the pos tags * - * @return an array of lemma classes for each token in the sequence. + * @return an array of possible lemmas for each token in the sequence. */ String[] lemmatize(String[] toks, String[] tags); + /** + * Generates a lemma tags for the word and postag returning the result in a list + * of every possible lemma for each token and postag. + * + * @param toks an array of the tokens + * @param tags an array of the pos tags + * @return a list of every possible lemma for each token in the sequence. + */ + List<List<String>> lemmatize(List<String> toks, List<String> tags); + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java index 4855fda..2b8122f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java @@ -19,6 +19,7 @@ package opennlp.tools.lemmatizer; import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -47,6 +48,7 @@ import opennlp.tools.util.TrainingParameters; */ public class LemmatizerME implements Lemmatizer { + public static final int LEMMA_NUMBER = 29; public static final int DEFAULT_BEAM_SIZE = 3; protected int beamSize; private Sequence bestSequence; @@ -86,9 +88,52 @@ public class LemmatizerME implements Lemmatizer { } public String[] 
lemmatize(String[] toks, String[] tags) { + String[] ses = predictSES(toks, tags); + String[] lemmas = decodeLemmas(toks, ses); + return lemmas; + } + + @Override public List<List<String>> lemmatize(List<String> toks, + List<String> tags) { + String[] tokens = toks.toArray(new String[toks.size()]); + String[] posTags = tags.toArray(new String[tags.size()]); + String[][] allLemmas = predictLemmas(LEMMA_NUMBER, tokens, posTags); + List<List<String>> predictedLemmas = new ArrayList<>(); + for (int i = 0; i < allLemmas.length; i++) { + predictedLemmas.add(Arrays.asList(allLemmas[i])); + } + return predictedLemmas; + } + + /** + * Predict Short Edit Script (automatically induced lemma class). + * @param toks the array of tokens + * @param tags the array of pos tags + * @return an array containing the lemma classes + */ + public String[] predictSES(String[] toks, String[] tags) { bestSequence = model.bestSequence(toks, new Object[] {tags}, contextGenerator, sequenceValidator); - List<String> c = bestSequence.getOutcomes(); - return c.toArray(new String[c.size()]); + List<String> ses = bestSequence.getOutcomes(); + return ses.toArray(new String[ses.size()]); + } + + /** + * Predict all possible lemmas (using a default upper bound). 
+ * @param numLemmas the default number of lemmas + * @param toks the tokens + * @param tags the postags + * @return a double array containing all posible lemmas for each token and postag pair + */ + public String[][] predictLemmas(int numLemmas, String[] toks, String[] tags) { + Sequence[] bestSequences = model.bestSequences(numLemmas, toks, new Object[] {tags}, + contextGenerator, sequenceValidator); + String[][] allLemmas = new String[bestSequences.length][]; + for (int i = 0; i < allLemmas.length; i++) { + List<String> ses = bestSequences[i].getOutcomes(); + String[] sesArray = ses.toArray(new String[ses.size()]); + allLemmas[i] = decodeLemmas(toks,sesArray); + } + return allLemmas; } /** @@ -97,11 +142,10 @@ public class LemmatizerME implements Lemmatizer { * @param preds the predicted lemma classes * @return the array of decoded lemmas */ - public String[] decodeLemmas(String[] toks, String[] preds) { + public static String[] decodeLemmas(String[] toks, String[] preds) { List<String> lemmas = new ArrayList<>(); for (int i = 0; i < toks.length; i++) { String lemma = StringUtil.decodeShortestEditScript(toks[i].toLowerCase(), preds[i]); - //System.err.println("-> DEBUG: " + toks[i].toLowerCase() + " " + preds[i] + " " + lemma); if (lemma.length() == 0) { lemma = "_"; } @@ -110,6 +154,18 @@ public class LemmatizerME implements Lemmatizer { return lemmas.toArray(new String[lemmas.size()]); } + public static String[] encodeLemmas(String[] toks, String[] lemmas) { + List<String> sesList = new ArrayList<>(); + for (int i = 0; i < toks.length; i++) { + String ses = StringUtil.getShortestEditScript(toks[i], lemmas[i]); + if (ses.length() == 0) { + ses = "_"; + } + sesList.add(ses); + } + return sesList.toArray(new String[sesList.size()]); + } + public Sequence[] topKSequences(String[] sentence, String[] tags) { return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { tags }, contextGenerator, sequenceValidator); 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java index 489ba38..dcfc883 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java +++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DummyLemmatizer.java @@ -19,6 +19,7 @@ package opennlp.tools.lemmatizer; import java.io.IOException; import java.util.Arrays; +import java.util.List; /** * This dummy lemmatizer implementation simulates a LemmatizerME. The file has @@ -56,4 +57,10 @@ public class DummyLemmatizer implements Lemmatizer { } } + @Override + public List<List<String>> lemmatize(List<String> toks, + List<String> tags) { + return null; + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/c09c9a4b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java index 76b4cd5..97dcc3c 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java @@ -82,8 +82,7 @@ public class LemmatizerMETest { @Test public void testLemmasAsArray() throws Exception { - String[] preds = lemmatizer.lemmatize(tokens, postags); - String[] lemmas = lemmatizer.decodeLemmas(tokens, preds); + String[] lemmas = lemmatizer.lemmatize(tokens, postags); Assert.assertArrayEquals(expect, lemmas); }
