Repository: opennlp
Updated Branches:
  refs/heads/trunk 65a73b130 -> 92bc7f05a


Add test to automate SF model regression test

This test automates the SF model regression tests with private
test data derived from the leipzig corpus. This has to be improved
to use only resources that can be shared across the community.

See issue OPENNLP-877


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/92bc7f05
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/92bc7f05
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/92bc7f05

Branch: refs/heads/trunk
Commit: 92bc7f05a26a6a867b10ac64a84f6feb12ab5f98
Parents: 65a73b1
Author: Jörn Kottmann <[email protected]>
Authored: Wed Nov 2 19:02:45 2016 +0100
Committer: Jörn Kottmann <[email protected]>
Committed: Wed Nov 2 19:02:45 2016 +0100

----------------------------------------------------------------------
 .../tools/eval/SourceForgeModelEval.java        | 310 +++++++++++++++++++
 1 file changed, 310 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/92bc7f05/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
new file mode 100644
index 0000000..f63fcb5
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -0,0 +1,310 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import junit.framework.Assert;
+import opennlp.tools.chunker.Chunker;
+import opennlp.tools.chunker.ChunkerME;
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.cmdline.parser.ParserTool;
+import opennlp.tools.formats.LeipzigDoccatSampleStream;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.parser.Parse;
+import opennlp.tools.parser.Parser;
+import opennlp.tools.parser.ParserFactory;
+import opennlp.tools.parser.ParserModel;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import org.junit.Test;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigInteger;
+import java.nio.charset.Charset;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+/**
+ * The tests only run if the input text files are available and those
+ * are derived from the leipzig corpus.
+ *
+ * Next step is to replace the input texts with ones that don't have license issues.
+ * Wikinews is probably a very good source. In addition, models that
+ * can be shared are also required to give everyone the possibility to run this.
+ */
+public class SourceForgeModelEval {
+
+  private static MessageDigest createDigest() {
+    try {
+      return MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  /**
+   * Runs the en-sent sentence model over the text of leipzig/sentences.txt and
+   * compares the MD5 digest of all detected sentences with the expected value.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalSentenceModel() throws IOException {
+
+    // Resolve the data directory through EvalUtil, like the other tests in this
+    // class, instead of a hard-coded absolute home-directory path.
+    SentenceModel model = new SentenceModel(
+            new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));
+
+    MessageDigest digest = createDigest();
+
+    SentenceDetector sentenceDetector = new SentenceDetectorME(model);
+
+    // All input lines are joined with a single space into one document so
+    // that the detector has to find the sentence boundaries itself.
+    StringBuilder text = new StringBuilder();
+
+    try (ObjectStream<String> lines = new PlainTextByLineStream(
+            new MarkableFileInputStreamFactory(
+                new File(EvalUtil.getOpennlpDataDir(), "leipzig/sentences.txt")),
+            Charset.forName("UTF-8"))) {
+
+      String line;
+      while ((line = lines.read()) != null) {
+        text.append(line).append(" ");
+      }
+    }
+
+    String[] sentences = sentenceDetector.sentDetect(text.toString());
+
+    for (String sentence : sentences) {
+      digest.update(sentence.getBytes("UTF-8"));
+    }
+
+    Assert.assertEquals(new BigInteger("54058993675314170033586747935067060992"),
+            new BigInteger(1, digest.digest()));
+  }
+
+  /**
+   * Tokenizes every line of leipzig/sentences.txt with the en-token model and
+   * checks the MD5 digest over all produced tokens against the expected value.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalTokenModel() throws IOException {
+    TokenizerModel tokenizerModel = new TokenizerModel(
+        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));
+    MessageDigest md5 = createDigest();
+    Tokenizer learnableTokenizer = new TokenizerME(tokenizerModel);
+
+    File input = new File(EvalUtil.getOpennlpDataDir(), "leipzig/sentences.txt");
+
+    try (ObjectStream<String> sentences = new PlainTextByLineStream(
+        new MarkableFileInputStreamFactory(input), Charset.forName("UTF-8"))) {
+
+      for (String sentence = sentences.read(); sentence != null;
+          sentence = sentences.read()) {
+        for (String token : learnableTokenizer.tokenize(sentence)) {
+          md5.update(token.getBytes("UTF-8"));
+        }
+      }
+    }
+
+    BigInteger expected = new BigInteger("309548448163611475251363008574168734058");
+    BigInteger actual = new BigInteger(1, md5.digest());
+    Assert.assertEquals(expected, actual);
+  }
+
+  /**
+   * Runs a name finder model over the whitespace tokenized lines of
+   * leipzig/simpleTok.txt and asserts the MD5 digest of all found spans.
+   * Each span contributes its type, start and end, concatenated as text.
+   *
+   * @param model the name finder model under test
+   * @param expectedHash the expected digest, as a non-negative integer
+   * @throws IOException if the input text cannot be read
+   */
+  private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
+      throws IOException {
+
+    MessageDigest md5 = createDigest();
+    TokenNameFinder finder = new NameFinderME(model);
+
+    File input = new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTok.txt");
+
+    try (ObjectStream<String> sentences = new PlainTextByLineStream(
+        new MarkableFileInputStreamFactory(input), Charset.forName("UTF-8"))) {
+
+      for (String sentence = sentences.read(); sentence != null;
+          sentence = sentences.read()) {
+        String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentence);
+        for (Span found : finder.find(tokens)) {
+          String encoded = found.getType() + found.getStart() + found.getEnd();
+          md5.update(encoded.getBytes("UTF-8"));
+        }
+      }
+    }
+
+    BigInteger actualHash = new BigInteger(1, md5.digest());
+    Assert.assertEquals(expectedHash, actualHash);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-date name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerDateModel() throws IOException {
+    File modelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-date.bin");
+    TokenNameFinderModel dateModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("13595680199220579055030594287753821185");
+    evalNameFinder(dateModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-location name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerLocationModel() throws IOException {
+    File modelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-location.bin");
+    TokenNameFinderModel locationModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("61423868331440897441202803979849564658");
+    evalNameFinder(locationModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-money name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerMoneyModel() throws IOException {
+    File modelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-money.bin");
+    TokenNameFinderModel moneyModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("31779803056581858429003932617173745364");
+    evalNameFinder(moneyModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-organization name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerOrganizationModel() throws IOException {
+    File modelFile =
+        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-organization.bin");
+    TokenNameFinderModel organizationModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("268615755804346283904103340480818555730");
+    evalNameFinder(organizationModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-percentage name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerPercentageModel() throws IOException {
+    File modelFile =
+        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-percentage.bin");
+    TokenNameFinderModel percentageModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("1793019183238911248412519564457497503");
+    evalNameFinder(percentageModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-person name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerPersonModel() throws IOException {
+    File modelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-person.bin");
+    TokenNameFinderModel nameModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("260378080051855476096106859434660527393");
+    evalNameFinder(nameModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-ner-time name finder model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalNerTimeModel() throws IOException {
+    File modelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-ner-time.bin");
+    TokenNameFinderModel timeModel = new TokenNameFinderModel(modelFile);
+
+    BigInteger expected = new BigInteger("264798318876255738642952635833268231353");
+    evalNameFinder(timeModel, expected);
+  }
+
+  /**
+   * Parses every line of leipzig/simpleTokPos.txt as a {@link POSSample},
+   * chunks it with the en-chunker model and asserts the MD5 digest over
+   * all strings returned by the chunker.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalChunkerModel() throws IOException {
+    ChunkerModel chunkerModel = new ChunkerModel(
+        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-chunker.bin"));
+    MessageDigest md5 = createDigest();
+    Chunker learnableChunker = new ChunkerME(chunkerModel);
+
+    File input = new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTokPos.txt");
+
+    try (ObjectStream<String> taggedLines = new PlainTextByLineStream(
+        new MarkableFileInputStreamFactory(input), Charset.forName("UTF-8"))) {
+
+      for (String taggedLine = taggedLines.read(); taggedLine != null;
+          taggedLine = taggedLines.read()) {
+        POSSample sample = POSSample.parse(taggedLine);
+        String[] words = sample.getSentence();
+        String[] posTags = sample.getTags();
+
+        for (String chunk : learnableChunker.chunk(words, posTags)) {
+          md5.update(chunk.getBytes("UTF-8"));
+        }
+      }
+    }
+
+    BigInteger expected = new BigInteger("87766988424222321513554054789708059330");
+    BigInteger actual = new BigInteger(1, md5.digest());
+    Assert.assertEquals(expected, actual);
+  }
+
+  /**
+   * Tags the whitespace tokenized lines of leipzig/simpleTok.txt with the
+   * given POS model and asserts the MD5 digest over all returned tags.
+   *
+   * @param model the POS model under test
+   * @param expectedHash the expected digest, as a non-negative integer
+   * @throws IOException if the input text cannot be read
+   */
+  private void evalPosModel(POSModel model, BigInteger expectedHash) throws IOException {
+    MessageDigest md5 = createDigest();
+    POSTagger posTagger = new POSTaggerME(model);
+
+    File input = new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTok.txt");
+
+    try (ObjectStream<String> sentences = new PlainTextByLineStream(
+        new MarkableFileInputStreamFactory(input), Charset.forName("UTF-8"))) {
+
+      for (String sentence = sentences.read(); sentence != null;
+          sentence = sentences.read()) {
+        String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(sentence);
+        for (String posTag : posTagger.tag(tokens)) {
+          md5.update(posTag.getBytes("UTF-8"));
+        }
+      }
+    }
+
+    BigInteger actualHash = new BigInteger(1, md5.digest());
+    Assert.assertEquals(expectedHash, actualHash);
+  }
+
+  /**
+   * Checks the output digest of the en-pos-maxent POS model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalMaxentModel() throws IOException {
+    File modelFile = new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-maxent.bin");
+    POSModel posModel = new POSModel(modelFile);
+
+    BigInteger expected = new BigInteger("6912278014292642909634347798602234960");
+    evalPosModel(posModel, expected);
+  }
+
+  /**
+   * Checks the output digest of the en-pos-perceptron POS model.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalPerceptronModel() throws IOException {
+    File modelFile =
+        new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin");
+    POSModel posModel = new POSModel(modelFile);
+
+    BigInteger expected = new BigInteger("333081688760132868394207450128996236484");
+    evalPosModel(posModel, expected);
+  }
+
+  /**
+   * Parses every line of leipzig/simpleTok.txt with the en-parser-chunking
+   * model, requesting one parse per line, and compares the MD5 digest of the
+   * textual parses with the expected value.
+   *
+   * @throws IOException if the model or the input text cannot be read
+   */
+  @Test
+  public void evalParserModel() throws IOException {
+
+    // Resolve the data directory through EvalUtil, like the other tests in this
+    // class, instead of a hard-coded absolute home-directory path.
+    ParserModel model = new ParserModel(
+            new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));
+
+    MessageDigest digest = createDigest();
+
+    Parser parser = ParserFactory.create(model);
+
+    try (ObjectStream<String> lines = new PlainTextByLineStream(
+            new MarkableFileInputStreamFactory(
+                new File(EvalUtil.getOpennlpDataDir(), "leipzig/simpleTok.txt")),
+            Charset.forName("UTF-8"))) {
+
+      String line;
+      while ((line = lines.read()) != null) {
+
+        Parse[] parse = ParserTool.parseLine(line, parser, 1);
+        if (parse.length > 0) {
+          digest.update(parse[0].toString().getBytes("UTF-8"));
+        }
+        else {
+          // Lines without a parse still contribute a fixed marker, so a
+          // change in which lines fail to parse alters the digest as well.
+          digest.update("empty".getBytes("UTF-8"));
+        }
+      }
+    }
+
+    Assert.assertEquals(new BigInteger("95566096874728850374427554294889512256"),
+            new BigInteger(1, digest.digest()));
+  }
+}

Reply via email to