OPENNLP-1015: Add tests for DataIndexers Closes #152
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/4ba2a8b9 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/4ba2a8b9 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/4ba2a8b9 Branch: refs/heads/parser_regression Commit: 4ba2a8b9745d669b4b6de645eded912e65813ed8 Parents: 0fb11cd Author: koji <[email protected]> Authored: Fri Apr 7 21:50:02 2017 +0900 Committer: Jörn Kottmann <[email protected]> Committed: Thu Apr 20 12:40:25 2017 +0200 ---------------------------------------------------------------------- .../tools/ml/model/OnePassDataIndexerTest.java | 64 ++++++++++ .../model/OnePassRealValueDataIndexerTest.java | 116 +++++++++++++++++++ .../ml/model/SimpleEventStreamBuilder.java | 76 ++++++++++++ .../tools/ml/model/TwoPassDataIndexerTest.java | 64 ++++++++++ 4 files changed, 320 insertions(+) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/4ba2a8b9/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassDataIndexerTest.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassDataIndexerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassDataIndexerTest.java new file mode 100644 index 0000000..e629e7a --- /dev/null +++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassDataIndexerTest.java @@ -0,0 +1,64 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Checks what {@link OnePassDataIndexer} exposes after indexing a small,
+ * hand-written event stream, using an empty {@link TrainingParameters} map.
+ */
+public class OnePassDataIndexerTest {
+
+  /**
+   * Feeds seven events (one per token of the sample sentence) to the indexer and
+   * asserts the contexts, values, outcomes, predicate labels and counts it reports.
+   */
+  @Test
+  public void testIndex() throws IOException {
+    // He belongs to <START:org> Apache Software Foundation <END> .
+    SimpleEventStreamBuilder builder = new SimpleEventStreamBuilder();
+    builder.add("other/w=he n1w=belongs n2w=to po=other pow=other,He powf=other,ic ppo=other");
+    builder.add("other/w=belongs p1w=he n1w=to n2w=apache po=other pow=other,belongs powf=other,lc ppo=other");
+    builder.add("other/w=to p1w=belongs p2w=he n1w=apache n2w=software po=other pow=other,to"
+        + " powf=other,lc ppo=other");
+    builder.add("org-start/w=apache p1w=to p2w=belongs n1w=software n2w=foundation po=other pow=other,Apache"
+        + " powf=other,ic ppo=other");
+    builder.add("org-cont/w=software p1w=apache p2w=to n1w=foundation n2w=. po=org-start"
+        + " pow=org-start,Software powf=org-start,ic ppo=other");
+    builder.add("org-cont/w=foundation p1w=software p2w=apache n1w=. po=org-cont pow=org-cont,Foundation"
+        + " powf=org-cont,ic ppo=org-start");
+    builder.add("other/w=. p1w=foundation p2w=software po=org-cont pow=org-cont,. powf=org-cont,other"
+        + " ppo=org-cont");
+    ObjectStream<Event> events = builder.build();
+
+    DataIndexer indexer = new OnePassDataIndexer();
+    indexer.init(new TrainingParameters(Collections.emptyMap()), null);
+    indexer.index(events);
+
+    // Three distinct contexts are expected, each holding only predicate index 0.
+    int[][] contexts = indexer.getContexts();
+    Assert.assertEquals(3, contexts.length);
+    for (int[] context : contexts) {
+      Assert.assertArrayEquals(new int[]{0}, context);
+    }
+
+    // The input carries no real values, so none are expected back.
+    Assert.assertNull(indexer.getValues());
+
+    Assert.assertEquals(5, indexer.getNumEvents());
+    Assert.assertArrayEquals(new int[]{0, 1, 2}, indexer.getOutcomeList());
+    Assert.assertArrayEquals(new int[]{3, 1, 1}, indexer.getNumTimesEventsSeen());
+    Assert.assertArrayEquals(new String[]{"ppo=other"}, indexer.getPredLabels());
+    Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"}, indexer.getOutcomeLabels());
+    Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/4ba2a8b9/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassRealValueDataIndexerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassRealValueDataIndexerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassRealValueDataIndexerTest.java
new file mode 100644
index 0000000..ab9eda3
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/OnePassRealValueDataIndexerTest.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Checks what {@link OnePassRealValueDataIndexer} exposes after indexing small,
+ * hand-written event streams, once without and once with real values attached
+ * to the context entries.
+ */
+public class OnePassRealValueDataIndexerTest {
+
+  // Fresh indexer per test, created in setUp().
+  private DataIndexer indexer;
+
+  /** Creates the indexer under test and initialises it with an empty parameter map. */
+  @Before
+  public void setUp() throws Exception {
+    indexer = new OnePassRealValueDataIndexer();
+    indexer.init(new TrainingParameters(Collections.emptyMap()), null);
+  }
+
+  /**
+   * Indexes seven events that carry no values and asserts that the value array has one
+   * (null) slot per context, alongside the usual context/outcome/predicate checks.
+   */
+  @Test
+  public void testIndex() throws IOException {
+    // He belongs to <START:org> Apache Software Foundation <END> .
+    ObjectStream<Event> eventStream = new SimpleEventStreamBuilder()
+        .add("other/w=he n1w=belongs n2w=to po=other pow=other,He powf=other,ic ppo=other")
+        .add("other/w=belongs p1w=he n1w=to n2w=apache po=other pow=other,belongs powf=other,lc ppo=other")
+        .add("other/w=to p1w=belongs p2w=he n1w=apache n2w=software po=other pow=other,to"
+            + " powf=other,lc ppo=other")
+        .add("org-start/w=apache p1w=to p2w=belongs n1w=software n2w=foundation po=other pow=other,Apache"
+            + " powf=other,ic ppo=other")
+        .add("org-cont/w=software p1w=apache p2w=to n1w=foundation n2w=. po=org-start"
+            + " pow=org-start,Software powf=org-start,ic ppo=other")
+        .add("org-cont/w=foundation p1w=software p2w=apache n1w=. po=org-cont pow=org-cont,Foundation"
+            + " powf=org-cont,ic ppo=org-start")
+        .add("other/w=. p1w=foundation p2w=software po=org-cont pow=org-cont,. powf=org-cont,other"
+            + " ppo=org-cont")
+        .build();
+
+    indexer.index(eventStream);
+    Assert.assertEquals(3, indexer.getContexts().length);
+    Assert.assertArrayEquals(new int[]{0}, indexer.getContexts()[0]);
+    Assert.assertArrayEquals(new int[]{0}, indexer.getContexts()[1]);
+    Assert.assertArrayEquals(new int[]{0}, indexer.getContexts()[2]);
+    // No values were supplied: one slot per context, each of them null.
+    Assert.assertEquals(3, indexer.getValues().length);
+    Assert.assertNull(indexer.getValues()[0]);
+    Assert.assertNull(indexer.getValues()[1]);
+    Assert.assertNull(indexer.getValues()[2]);
+    Assert.assertEquals(5, indexer.getNumEvents());
+    Assert.assertArrayEquals(new int[]{0, 1, 2}, indexer.getOutcomeList());
+    Assert.assertArrayEquals(new int[]{3, 1, 1}, indexer.getNumTimesEventsSeen());
+    Assert.assertArrayEquals(new String[]{"ppo=other"}, indexer.getPredLabels());
+    Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"}, indexer.getOutcomeLabels());
+    Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
+  }
+
+  /**
+   * Indexes the same seven events with a {@code ;value} suffix on every context entry
+   * and asserts the per-context value arrays reported by the indexer.
+   */
+  @Test
+  public void testIndexValues() throws IOException {
+    // He belongs to <START:org> Apache Software Foundation <END> .
+    ObjectStream<Event> eventStream = new SimpleEventStreamBuilder()
+        .add("other/w=he;0.1 n1w=belongs;0.2 n2w=to;0.1 po=other;0.1"
+            + " pow=other,He;0.1 powf=other,ic;0.1 ppo=other;0.1")
+        .add("other/w=belongs;0.1 p1w=he;0.2 n1w=to;0.1 n2w=apache;0.1"
+            + " po=other;0.1 pow=other,belongs;0.1 powf=other,lc;0.1 ppo=other;0.1")
+        .add("other/w=to;0.1 p1w=belongs;0.2 p2w=he;0.1 n1w=apache;0.1"
+            + " n2w=software;0.1 po=other;0.1 pow=other,to;0.1 powf=other,lc;0.1 ppo=other;0.1")
+        .add("org-start/w=apache;0.1 p1w=to;0.2 p2w=belongs;0.1 n1w=software;0.1 n2w=foundation;0.1"
+            + " po=other;0.1 pow=other,Apache;0.1 powf=other,ic;0.1 ppo=other;0.1")
+        .add("org-cont/w=software;0.1 p1w=apache;0.2 p2w=to;0.1 n1w=foundation;0.1"
+            + " n2w=.;0.1 po=org-start;0.1 pow=org-start,Software;0.1 powf=org-start,ic;0.1 ppo=other;0.1")
+        .add("org-cont/w=foundation;0.1 p1w=software;0.2 p2w=apache;0.1 n1w=.;0.1 po=org-cont;0.1"
+            + " pow=org-cont,Foundation;0.1 powf=org-cont,ic;0.1 ppo=org-start;0.1")
+        .add("other/w=.;0.1 p1w=foundation;0.1 p2w=software;0.1 po=org-cont;0.1 pow=org-cont,.;0.1"
+            + " powf=org-cont,other;0.1 ppo=org-cont;0.1")
+        .build();
+
+    // The stray System.out.println(indexer) debug statement was removed here:
+    // tests should report through assertions, not standard output.
+    indexer.index(eventStream);
+    Assert.assertEquals(3, indexer.getContexts().length);
+    Assert.assertArrayEquals(new int[]{0}, indexer.getContexts()[0]);
+    Assert.assertArrayEquals(new int[]{0}, indexer.getContexts()[1]);
+    Assert.assertArrayEquals(new int[]{0}, indexer.getContexts()[2]);
+    Assert.assertEquals(3, indexer.getValues().length);
+    // Tolerance for float comparison of the parsed values.
+    final float delta = 0.001F;
+    Assert.assertArrayEquals(new float[]{0.1F, 0.2F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F},
+        indexer.getValues()[0], delta);
+    Assert.assertArrayEquals(new float[]{0.1F, 0.2F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F},
+        indexer.getValues()[1], delta);
+    Assert.assertArrayEquals(new float[]{0.1F, 0.2F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F, 0.1F},
+        indexer.getValues()[2], delta);
+    Assert.assertEquals(5, indexer.getNumEvents());
+    Assert.assertArrayEquals(new int[]{0, 1, 2}, indexer.getOutcomeList());
+    Assert.assertArrayEquals(new int[]{3, 1, 1}, indexer.getNumTimesEventsSeen());
+    Assert.assertArrayEquals(new String[]{"ppo=other"}, indexer.getPredLabels());
+    Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"}, indexer.getOutcomeLabels());
+    Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/4ba2a8b9/opennlp-tools/src/test/java/opennlp/tools/ml/model/SimpleEventStreamBuilder.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/SimpleEventStreamBuilder.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/SimpleEventStreamBuilder.java
new file mode 100644
index 0000000..49fa242
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/SimpleEventStreamBuilder.java
@@ -0,0 +1,76 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Test helper that parses one-line textual event descriptions into {@link Event}s
+ * and serves them back as an in-memory {@link ObjectStream}.
+ */
+public class SimpleEventStreamBuilder {
+
+  private final List<Event> eventList = new ArrayList<>();
+
+  /**
+   * Parses one event description and appends the resulting {@link Event}.
+   *
+   * <p>The format of event should look like:
+   * <pre>
+   * without values) other/w=he n1w=belongs n2w=to po=other pow=other,He powf=other,ic
+   * with values)    other/w=he;0.5 n1w=belongs;0.4 n2w=to;0.3 po=other;0.5 pow=other,He;0.25 powf=other,ic;0.5
+   * </pre>
+   * Whether values are present is decided from the first context entry only.
+   *
+   * @param event the outcome, a {@code /}, then whitespace-separated context entries
+   * @return this builder, so that calls can be chained
+   * @throws IllegalArgumentException if {@code event} does not split on {@code /} into exactly
+   *     an outcome and a context part, or if values are in use and an entry does not split
+   *     on {@code ;} into exactly a name and a value
+   * @throws NumberFormatException if a value cannot be parsed as a float
+   */
+  public SimpleEventStreamBuilder add(String event) {
+    String[] ss = event.split("/");
+    if (ss.length != 2) {
+      throw new IllegalArgumentException(String.format("format error of the event \"%s\"", event));
+    }
+
+    // look for context (and values)
+    String[] cvPairs = ss[1].split("\\s+");
+    if (cvPairs[0].contains(";")) { // has values?
+      String[] context = new String[cvPairs.length];
+      float[] values = new float[cvPairs.length];
+      for (int i = 0; i < cvPairs.length; i++) {
+        String[] pair = cvPairs[i].split(";");
+        if (pair.length != 2) {
+          // Report the raw entry; formatting the split array would print its identity
+          // ("[Ljava.lang.String;@...") instead of the offending text.
+          throw new IllegalArgumentException(String.format("format error of the event \"%s\". "
+              + "\"%s\" doesn't have value", event, cvPairs[i]));
+        }
+        context[i] = pair[0];
+        values[i] = Float.parseFloat(pair[1]);
+      }
+      eventList.add(new Event(ss[0], context, values));
+    }
+    else {
+      eventList.add(new Event(ss[0], cvPairs));
+    }
+
+    return this;
+  }
+
+  /**
+   * Creates a stream over the events added so far, in insertion order.
+   *
+   * <p>The stream reads the builder's list directly (no copy is taken) and keeps its own
+   * read position, so every stream returned by this method starts at the first event.
+   *
+   * @return a stream whose {@code read()} returns {@code null} once all events are consumed
+   */
+  public ObjectStream<Event> build() {
+    return new ObjectStream<Event>() {
+      // Cursor belongs to the stream, not the builder, so streams do not interfere.
+      private int pos = 0;
+
+      @Override
+      public Event read() throws IOException {
+        if (eventList.size() <= pos) {
+          return null;
+        }
+        return eventList.get(pos++);
+      }
+
+      @Override
+      public void reset() {
+        pos = 0;
+      }
+    };
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/4ba2a8b9/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
new file mode 100644
index 0000000..c246936
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/TwoPassDataIndexerTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import java.io.IOException;
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Checks what {@link TwoPassDataIndexer} exposes after indexing a small,
+ * hand-written event stream, using an empty {@link TrainingParameters} map.
+ */
+public class TwoPassDataIndexerTest {
+
+  /**
+   * Feeds seven events (one per token of the sample sentence) to the indexer and
+   * asserts the contexts, values, outcomes, predicate labels and counts it reports.
+   */
+  @Test
+  public void testIndex() throws IOException {
+    // He belongs to <START:org> Apache Software Foundation <END> .
+    SimpleEventStreamBuilder builder = new SimpleEventStreamBuilder();
+    builder.add("other/w=he n1w=belongs n2w=to po=other pow=other,He powf=other,ic ppo=other");
+    builder.add("other/w=belongs p1w=he n1w=to n2w=apache po=other pow=other,belongs powf=other,lc ppo=other");
+    builder.add("other/w=to p1w=belongs p2w=he n1w=apache n2w=software po=other pow=other,to"
+        + " powf=other,lc ppo=other");
+    builder.add("org-start/w=apache p1w=to p2w=belongs n1w=software n2w=foundation po=other pow=other,Apache"
+        + " powf=other,ic ppo=other");
+    builder.add("org-cont/w=software p1w=apache p2w=to n1w=foundation n2w=. po=org-start"
+        + " pow=org-start,Software powf=org-start,ic ppo=other");
+    builder.add("org-cont/w=foundation p1w=software p2w=apache n1w=. po=org-cont pow=org-cont,Foundation"
+        + " powf=org-cont,ic ppo=org-start");
+    builder.add("other/w=. p1w=foundation p2w=software po=org-cont pow=org-cont,. powf=org-cont,other"
+        + " ppo=org-cont");
+    ObjectStream<Event> events = builder.build();
+
+    DataIndexer indexer = new TwoPassDataIndexer();
+    indexer.init(new TrainingParameters(Collections.emptyMap()), null);
+    indexer.index(events);
+
+    // Three distinct contexts are expected, each holding only predicate index 0.
+    int[][] contexts = indexer.getContexts();
+    Assert.assertEquals(3, contexts.length);
+    for (int[] context : contexts) {
+      Assert.assertArrayEquals(new int[]{0}, context);
+    }
+
+    // The input carries no real values, so none are expected back.
+    Assert.assertNull(indexer.getValues());
+
+    Assert.assertEquals(5, indexer.getNumEvents());
+    Assert.assertArrayEquals(new int[]{0, 1, 2}, indexer.getOutcomeList());
+    Assert.assertArrayEquals(new int[]{3, 1, 1}, indexer.getNumTimesEventsSeen());
+    Assert.assertArrayEquals(new String[]{"ppo=other"}, indexer.getPredLabels());
+    Assert.assertArrayEquals(new String[]{"other", "org-start", "org-cont"}, indexer.getOutcomeLabels());
+    Assert.assertArrayEquals(new int[]{5}, indexer.getPredCounts());
+  }
+}
