OPENNLP-622 Fixed MorfologikPOSTaggerFactory and restored its test.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ceb5540 Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ceb5540 Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ceb5540 Branch: refs/heads/trunk Commit: 3ceb5540ced842875c010bb81169afcb544f203e Parents: 1314887 Author: William Colen <[email protected]> Authored: Fri Jul 8 03:52:14 2016 +0000 Committer: William Colen <[email protected]> Committed: Fri Jul 8 03:52:14 2016 +0000 ---------------------------------------------------------------------- .../tagdict/MorfologikPOSTaggerFactory.java | 46 +++-- .../tagdict/POSTaggerFactoryTest.java | 192 ++++++++----------- 2 files changed, 106 insertions(+), 132 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ceb5540/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java index 723b1ce..dcb6554 100644 --- a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java +++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java @@ -26,9 +26,11 @@ import java.io.IOException; import java.io.InputStream; import java.io.OutputStream; import java.nio.file.Files; +import java.nio.file.Path; import java.nio.file.Paths; import java.util.Map; +import morfologik.stemming.DictionaryMetadata; import opennlp.tools.dictionary.Dictionary; import opennlp.tools.postag.POSTaggerFactory; import opennlp.tools.postag.TagDictionary; @@ -53,23 +55,27 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory { public MorfologikPOSTaggerFactory() { } - - /** - * Creates a new {@link POSTaggerFactory} that uses the a Morfologik based 
{@link TagDictionary}. - * - * @param ngramDictionary a ngramDictionary - * @param morfologikDictionary a Morfologik dictionary - * @param morfologikDictionaryMetadata the dictionary metadata - * @throws IOException invalid Morfologik dictionary - */ - public MorfologikPOSTaggerFactory(Dictionary ngramDictionary, - byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException { - super(ngramDictionary, null); - this.dictData = morfologikDictionary; - this.dictInfo = morfologikDictionaryMetadata; + + public TagDictionary createTagDictionary(File dictionary) + throws InvalidFormatException, FileNotFoundException, IOException { + + if(!dictionary.canRead()) { + throw new FileNotFoundException("Could not read dictionary: " + dictionary.getAbsolutePath()); + } + + Path dictionaryMeta = DictionaryMetadata.getExpectedMetadataLocation(dictionary.toPath()); + + if(dictionaryMeta == null || !dictionaryMeta.toFile().canRead()) { + throw new FileNotFoundException("Could not read dictionary metadata: " + dictionaryMeta.getFileName()); + } + + this.dictData = Files.readAllBytes(dictionary.toPath()); + this.dictInfo = Files.readAllBytes(dictionaryMeta); + + return createMorfologikDictionary(dictData, dictInfo); - this.dict = createMorfologikDictionary(dictData, dictInfo); } + @Override protected void init(Dictionary ngramDictionary, TagDictionary posDictionary) { @@ -130,8 +136,7 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory { @Override public void setTagDictionary(TagDictionary dictionary) { - throw new UnsupportedOperationException( - "Morfologik POS Tagger factory does not support this operation"); + this.dict = dictionary; } @Override @@ -141,13 +146,6 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory { } @Override - public TagDictionary createTagDictionary(File dictionary) - throws InvalidFormatException, FileNotFoundException, IOException { - throw new UnsupportedOperationException( - "Morfologik POS Tagger 
factory does not support this operation"); - } - - @Override public TagDictionary createTagDictionary(InputStream in) throws InvalidFormatException, IOException { throw new UnsupportedOperationException( http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ceb5540/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java index 6c6814b..9233979 100644 --- a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java +++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java @@ -1,108 +1,84 @@ -///* -// * Licensed to the Apache Software Foundation (ASF) under one or more -// * contributor license agreements. See the NOTICE file distributed with -// * this work for additional information regarding copyright ownership. -// * The ASF licenses this file to You under the Apache License, Version 2.0 -// * (the "License"); you may not use this file except in compliance with -// * the License. You may obtain a copy of the License at -// * -// * http://www.apache.org/licenses/LICENSE-2.0 -// * -// * Unless required by applicable law or agreed to in writing, software -// * distributed under the License is distributed on an "AS IS" BASIS, -// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// * See the License for the specific language governing permissions and -// * limitations under the License. 
-// */ -// -//package opennlp.morfologik.tagdict; -// -//import static org.junit.Assert.assertTrue; -// -//import java.io.ByteArrayInputStream; -//import java.io.ByteArrayOutputStream; -//import java.io.File; -//import java.io.IOException; -//import java.io.InputStream; -//import java.io.InputStreamReader; -//import java.nio.charset.Charset; -//import java.nio.file.Files; -//import java.nio.file.Path; -//import java.nio.file.Paths; -// -//import morfologik.stemming.DictionaryMetadata; -//import morfologik.stemming.EncoderType; -//import opennlp.morfologik.builder.MorfologikDictionayBuilder; -//import opennlp.morfologik.builder.POSDictionayBuilderTest; -//import opennlp.tools.dictionary.Dictionary; -//import opennlp.tools.postag.DefaultPOSSequenceValidator; -//import opennlp.tools.postag.POSContextGenerator; -//import opennlp.tools.postag.POSDictionary; -//import opennlp.tools.postag.POSModel; -//import opennlp.tools.postag.POSSample; -//import opennlp.tools.postag.POSTaggerFactory; -//import opennlp.tools.postag.POSTaggerME; -//import opennlp.tools.postag.WordTagSampleStream; -//import opennlp.tools.util.BaseToolFactory; -//import opennlp.tools.util.InvalidFormatException; -//import opennlp.tools.util.ObjectStream; -//import opennlp.tools.util.TrainingParameters; -//import opennlp.tools.util.model.ModelType; -// -//import org.junit.Test; -// -///** -// * Tests for the {@link POSTaggerFactory} class. 
-// */ -//public class POSTaggerFactoryTest { -// -// private static ObjectStream<POSSample> createSampleStream() -// throws IOException { -// InputStream in = POSTaggerFactoryTest.class.getClassLoader() -// .getResourceAsStream("AnnotatedSentences.txt"); -// -// return new WordTagSampleStream((new InputStreamReader(in))); -// } -// -// static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory) -// throws IOException { -// return POSTaggerME.train("en", createSampleStream(), -// TrainingParameters.defaultParams(), factory); -// } -// -// @Test -// public void testPOSTaggerWithCustomFactory() throws Exception { -// -// MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); -// File dictInFile = new File(POSDictionayBuilderTest.class.getResource( -// "/dictionaryWithLemma.txt").getFile()); -// -// File dictOutFile = File.createTempFile( -// POSDictionayBuilderTest.class.getName(), ".dict"); -// -// builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", -// EncoderType.PREFIX); -// -// Path dictPath = dictOutFile.toPath(); -// Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath); -// -// byte[] dic = Files.readAllBytes(dictPath); -// byte[] meta = Files.readAllBytes(metaPath); -// -// POSModel posModel = trainPOSModel(ModelType.MAXENT, -// new MorfologikPOSTaggerFactory(null, dic, meta)); -// -// POSTaggerFactory factory = posModel.getFactory(); -// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory); -// -// ByteArrayOutputStream out = new ByteArrayOutputStream(); -// posModel.serialize(out); -// ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); -// -// POSModel fromSerialized = new POSModel(in); -// -// factory = fromSerialized.getFactory(); -// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory); -// } -// -//} \ No newline at end of file +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor 
license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.morfologik.tagdict; + +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.file.Path; + +import opennlp.morfologik.builder.POSDictionayBuilderTest; +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSSample; +import opennlp.tools.postag.POSTaggerFactory; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.postag.TagDictionary; +import opennlp.tools.postag.WordTagSampleStream; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.TrainingParameters; +import opennlp.tools.util.model.ModelType; + +import org.junit.Test; + +/** + * Tests for the {@link POSTaggerFactory} class. 
+ */ +public class POSTaggerFactoryTest { + + private static ObjectStream<POSSample> createSampleStream() + throws IOException { + InputStream in = POSTaggerFactoryTest.class.getClassLoader() + .getResourceAsStream("AnnotatedSentences.txt"); + + return new WordTagSampleStream((new InputStreamReader(in))); + } + + static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory) + throws IOException { + return POSTaggerME.train("en", createSampleStream(), + TrainingParameters.defaultParams(), factory); + } + + @Test + public void testPOSTaggerWithCustomFactory() throws Exception { + + Path dictionary = POSDictionayBuilderTest.createMorfologikDictionary(); + POSTaggerFactory inFactory = new MorfologikPOSTaggerFactory(); + TagDictionary inDict = inFactory.createTagDictionary(dictionary.toFile()); + inFactory.setTagDictionary(inDict); + + POSModel posModel = trainPOSModel(ModelType.MAXENT, inFactory); + + POSTaggerFactory factory = posModel.getFactory(); + assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary); + + ByteArrayOutputStream out = new ByteArrayOutputStream(); + posModel.serialize(out); + ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); + + POSModel fromSerialized = new POSModel(in); + + factory = fromSerialized.getFactory(); + assertTrue(factory.getTagDictionary() instanceof MorfologikTagDictionary); + } + +} \ No newline at end of file
