OPENNLP-1033: Adds unit tests for opennlp.tools.ngram, closes apache/opennlp#172
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ba27e9f Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ba27e9f Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ba27e9f Branch: refs/heads/LangDetect Commit: 3ba27e9f4a29be1922b3a19f9c6c5127f93027ab Parents: d447459 Author: jzonthemtn <[email protected]> Authored: Wed Apr 19 15:53:32 2017 -0400 Committer: smarthi <[email protected]> Committed: Wed Apr 19 15:53:32 2017 -0400 ---------------------------------------------------------------------- .../java/opennlp/tools/ngram/NGramModel.java | 6 +- .../opennlp/tools/ngram/NGramGeneratorTest.java | 91 ++++++++++++++++++ .../opennlp/tools/ngram/NGramModelTest.java | 98 ++++++++++++++------ .../tools/ngram/ngram-model-no-count.xml | 27 ++++++ .../tools/ngram/ngram-model-not-a-number.xml | 27 ++++++ 5 files changed, 222 insertions(+), 27 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java index 7005dc4..0e0e4dd 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java +++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java @@ -216,6 +216,7 @@ public class NGramModel implements Iterable<StringList> { * * @return iterator over all grams */ + @Override public Iterator<StringList> iterator() { return mNGrams.keySet().iterator(); } @@ -306,10 +307,12 @@ public class NGramModel implements Iterable<StringList> { { private Iterator<StringList> mDictionaryIterator = NGramModel.this.iterator(); + @Override public boolean hasNext() { return mDictionaryIterator.hasNext(); } + @Override public Entry next() { StringList tokens = mDictionaryIterator.next(); @@ -317,10 +320,11 @@ public class NGramModel implements Iterable<StringList> { Attributes attributes = new Attributes(); attributes.setValue(COUNT, Integer.toString(getCount(tokens))); - + return new Entry(tokens, attributes); } + @Override public void remove() { throw new UnsupportedOperationException(); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java new file mode 100644 index 0000000..b1da5d6 --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java @@ -0,0 +1,91 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.ngram; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import org.junit.Assert; +import org.junit.Test; + +public class NGramGeneratorTest { + + @Test + public void generateListTest() { + + final List<String> input = Arrays.asList("This", "is", "a", "sentence"); + final int window = 2; + final String separator = "-"; + + final List<String> ngrams = NGramGenerator.generate(input, window, separator); + + Assert.assertEquals(3, ngrams.size()); + Assert.assertTrue(ngrams.contains("This-is")); + Assert.assertTrue(ngrams.contains("is-a")); + Assert.assertTrue(ngrams.contains("a-sentence")); + + } + + @Test + public void generateCharTest() { + + final char[] input = "Test again".toCharArray(); + final int window = 4; + final String separator = "-"; + + final List<String> ngrams = NGramGenerator.generate(input, window, separator); + + Assert.assertEquals(7, ngrams.size()); + Assert.assertTrue(ngrams.contains("T-e-s-t")); + Assert.assertTrue(ngrams.contains("e-s-t- ")); + Assert.assertTrue(ngrams.contains("s-t- -a")); + Assert.assertTrue(ngrams.contains("t- -a-g")); + Assert.assertTrue(ngrams.contains(" -a-g-a")); + Assert.assertTrue(ngrams.contains("a-g-a-i")); + Assert.assertTrue(ngrams.contains("g-a-i-n")); + + } + + @Test + public void generateLargerWindowThanListTest() { + + final List<String> input = Arrays.asList("One", "two"); + final int window = 3; + final String separator = "-"; + + final List<String> ngrams = NGramGenerator.generate(input, window, separator); + + Assert.assertTrue(ngrams.isEmpty()); + + } + + @Test + public void emptyTest() { + + final List<String> input = new ArrayList<>(); + final int window = 2; + final String separator = "-"; + + final List<String> ngrams = NGramGenerator.generate(input, window, separator); + + Assert.assertTrue(ngrams.isEmpty()); + + } + +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java index e4fb43d..47c228c 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java @@ -17,16 +17,17 @@ package opennlp.tools.ngram; +import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.InputStream; import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; -import org.apache.commons.io.IOUtils; import org.junit.Assert; -import org.junit.Ignore; import org.junit.Test; import opennlp.tools.dictionary.Dictionary; +import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.StringList; /** @@ -169,31 +170,76 @@ public class NGramModelTest { Assert.assertEquals(1, dictionary.getMinTokenCount()); Assert.assertEquals(3, dictionary.getMaxTokenCount()); } - - @Ignore + + @Test(expected = InvalidFormatException.class) + public void testInvalidFormat() throws Exception { + InputStream stream = new ByteArrayInputStream("inputstring".getBytes(StandardCharsets.UTF_8)); + NGramModel ngramModel = new NGramModel(stream); + stream.close(); + ngramModel.toDictionary(true); + } + + @Test + public void testFromFile() throws Exception { + InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml"); + NGramModel ngramModel = new NGramModel(stream); + stream.close(); + Dictionary dictionary = ngramModel.toDictionary(true); + Assert.assertNotNull(dictionary); + Assert.assertEquals(14, dictionary.size()); + Assert.assertEquals(3, dictionary.getMaxTokenCount()); + Assert.assertEquals(1, dictionary.getMinTokenCount()); + } + @Test public void testSerialize() throws Exception { - NGramModel ngramModel = new NGramModel(); - StringList tokens = new StringList("the", "brown", "fox", "jumped"); - ngramModel.add(tokens, 1, 3); - tokens = new StringList("the", "brown", "Fox", "jumped"); - ngramModel.add(tokens, 1, 3); - ByteArrayOutputStream out = new ByteArrayOutputStream(); - ngramModel.serialize(out); - Assert.assertNotNull(out); - InputStream nGramModelStream = getClass() - .getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml"); - String modelString = IOUtils.toString(nGramModelStream); - // remove AL header - int start = modelString.indexOf("<!--"); - int end = modelString.indexOf("-->"); - String asfHeaderString = modelString.substring(start, end + 3); - modelString = modelString.replace(asfHeaderString, ""); - String outputString = out.toString(Charset.forName("UTF-8").name()); - Assert.assertEquals( - modelString.replaceAll("\n", "").replaceAll("\r", "") - .replaceAll("\t", "").replaceAll(" ", ""), - outputString.replaceAll("\n", "").replaceAll("\r", "") - .replaceAll("\t", "").replaceAll(" ", "")); + + InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml"); + + NGramModel ngramModel1 = new NGramModel(stream); + stream.close(); + + Dictionary dictionary = ngramModel1.toDictionary(true); + Assert.assertNotNull(dictionary); + Assert.assertEquals(14, dictionary.size()); + Assert.assertEquals(3, dictionary.getMaxTokenCount()); + Assert.assertEquals(1, dictionary.getMinTokenCount()); + + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + ngramModel1.serialize(baos); + + final String serialized = new String(baos.toByteArray(), Charset.defaultCharset()); + InputStream inputStream = new ByteArrayInputStream(serialized.getBytes(StandardCharsets.UTF_8)); + + NGramModel ngramModel2 = new NGramModel(inputStream); + stream.close(); + + Assert.assertEquals(ngramModel2.numberOfGrams(), ngramModel2.numberOfGrams()); + Assert.assertEquals(ngramModel2.size(), ngramModel2.size()); + + dictionary = ngramModel2.toDictionary(true); + + Assert.assertNotNull(dictionary); + Assert.assertEquals(14, dictionary.size()); + Assert.assertEquals(3, dictionary.getMaxTokenCount()); + Assert.assertEquals(1, dictionary.getMinTokenCount()); + } + + @Test(expected = InvalidFormatException.class) + public void testFromInvalidFileMissingCount() throws Exception { + InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model-no-count.xml"); + NGramModel ngramModel = new NGramModel(stream); + stream.close(); + ngramModel.toDictionary(true); + } + + @Test(expected = InvalidFormatException.class) + public void testFromInvalidFileNotANumber() throws Exception { + InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model-not-a-number.xml"); + NGramModel ngramModel = new NGramModel(stream); + stream.close(); + ngramModel.toDictionary(true); + } + } http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml new file mode 100644 index 0000000..62a1d90 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +<dictionary case_sensitive="false"> + <entry> + <token>brown</token> + <token>fox</token> + </entry> +</dictionary> http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml new file mode 100644 index 0000000..e132ea4 --- /dev/null +++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml @@ -0,0 +1,27 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +--> + +<dictionary case_sensitive="false"> + <entry count="asdf"> + <token>brown</token> + <token>fox</token> + </entry> +</dictionary>
