OPENNLP-622 Refactored to remove usage of main methods of Morfologik.
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/1314887f Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/1314887f Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/1314887f Branch: refs/heads/trunk Commit: 1314887fe657f21e1213788fd6084a485781f2f1 Parents: 15c3fb7 Author: William Colen <[email protected]> Authored: Thu Jul 7 05:19:18 2016 +0000 Committer: William Colen <[email protected]> Committed: Thu Jul 7 05:19:18 2016 +0000 ---------------------------------------------------------------------- .../builder/MorfologikDictionayBuilder.java | 144 ++++++------------- .../MorfologikDictionaryBuilderParams.java | 37 +++-- .../MorfologikDictionaryBuilderTool.java | 17 +-- .../lemmatizer/MorfologikLemmatizer.java | 8 +- .../tagdict/MorfologikPOSTaggerFactory.java | 14 +- .../builder/POSDictionayBuilderTest.java | 67 +++------ .../lemmatizer/MorfologikLemmatizerTest.java | 17 +-- .../tagdict/MorfologikTagDictionaryTest.java | 18 +-- .../tagdict/POSTaggerFactoryTest.java | 108 ++++++++++++++ src/test/resources/AnnotatedSentences.txt | 136 ++++++++++++++++++ src/test/resources/dictionaryWithLemma.info | 15 ++ src/test/resources/dictionaryWithLemma.txt | 21 +-- 12 files changed, 386 insertions(+), 216 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java index 0131318..dbbca4d 100644 --- a/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java +++ b/src/main/java/opennlp/morfologik/builder/MorfologikDictionayBuilder.java @@ -17,21 +17,15 @@ package opennlp.morfologik.builder; 
-import java.io.File; import java.io.FileNotFoundException; -import java.io.FileOutputStream; import java.io.IOException; -import java.io.OutputStream; import java.nio.charset.Charset; import java.nio.file.Path; -import java.util.ArrayList; -import java.util.List; import java.util.Properties; import morfologik.stemming.DictionaryMetadata; import morfologik.stemming.EncoderType; -import morfologik.tools.FSACompile; -import morfologik.tools.Launcher; +import morfologik.tools.DictCompile; /** * Utility class to build Morfologik dictionaries from a tab separated values @@ -41,117 +35,69 @@ import morfologik.tools.Launcher; public class MorfologikDictionayBuilder { /** - * Build a Morfologik binary dictionary - * - * @param dictInFile - * the 3 column TSV dictionary file - * @param dictOutFile - * where to store the binary Morfologik dictionary - * @param encoding - * the encoding to be used while reading and writing - * @param separator - * a field separator, the default is '+'. If your tags contains '+' - * change to something else - * @param encoderType - * the Morfologik enconder type - * @param isUseInfixes - * if to compact using infixes + * Helper to compile a morphological dictionary automaton. + * + * @param input + * The input file (base,inflected,tag). An associated metadata + * (*.info) file must exist. + * @param overwrite + * Overwrite the output file if it exists. + * @param validate + * Validate input to make sure it makes sense. + * @param acceptBom + * Accept leading BOM bytes (UTF-8). + * @param acceptCr + * Accept CR bytes in input sequences (\r). + * @param ignoreEmpty + * Ignore empty lines in the input. 
+ * @return the dictionary path + * * @throws Exception */ - public void build(File dictInFile, File dictOutFile, Charset encoding, - String separator, EncoderType encoderType) + public Path build(Path input, boolean overwrite, boolean validate, + boolean acceptBom, boolean acceptCr, boolean ignoreEmpty) throws Exception { - Path propertiesPath = DictionaryMetadata - .getExpectedMetadataLocation(dictOutFile.toPath()); + + DictCompile compiler = new DictCompile(input, overwrite, validate, + acceptBom, acceptCr, ignoreEmpty); + compiler.call(); + + + Path metadataPath = DictionaryMetadata + .getExpectedMetadataLocation(input); - this.build(dictInFile, dictOutFile, propertiesPath.toFile(), encoding, separator, - encoderType); + return metadataPath.resolveSibling( + metadataPath.getFileName().toString().replaceAll( + "\\." + DictionaryMetadata.METADATA_FILE_EXTENSION + "$", ".dict")); } /** - * Build a Morfologik binary dictionary - * - * @param dictInFile - * the 3 column TSV dictionary file - * @param dictOutFile - * where to store the binary Morfologik dictionary - * @param propertiesOutFile - * where to store the properties of the Morfologik dictionary - * @param encoding - * the encoding to be used while reading and writing - * @param separator - * a field separator, the default is '+'. If your tags contains '+' - * change to something else - * @param isUsePrefixes - * if to compact using prefixes - * @param isUseInfixes - * if to compact using infixes + * Helper to compile a morphological dictionary automaton using default + * parameters. + * + * @param input + * The input file (base,inflected,tag). An associated metadata + * (*.info) file must exist. 
+ * + * @return the dictionary path + * * @throws Exception */ - public void build(File dictInFile, File dictOutFile, File propertiesOutFile, - Charset encoding, String separator, EncoderType encoderType) throws Exception { - - // we need to execute tab2morph followed by fsa_build - - File morph = tab2morph(dictInFile, separator, encoderType); + public Path build(Path input) throws Exception { - fsaBuild(morph, dictOutFile); + return build(input, true, true, false, false, false); - morph.delete(); - - // now we create the properties files using the passed parameters - createProperties(encoding, separator, encoderType, - propertiesOutFile); } - void createProperties(Charset encoding, String separator, - EncoderType encoderType, File propertiesFile) - throws FileNotFoundException, IOException { + Properties createProperties(Charset encoding, String separator, + EncoderType encoderType) throws FileNotFoundException, IOException { Properties properties = new Properties(); properties.setProperty("fsa.dict.separator", separator); properties.setProperty("fsa.dict.encoding", encoding.name()); properties.setProperty("fsa.dict.encoder", encoderType.name()); - OutputStream os = new FileOutputStream(propertiesFile); - properties.store(os, "Morfologik POS Dictionary properties"); - os.close(); - - } + return properties; - private void fsaBuild(File morph, File dictOutFile) throws Exception { - String[] params = { "-f", "cfsa2", "-i", morph.getAbsolutePath(), "-o", - dictOutFile.getAbsolutePath() }; - FSACompile.main(params); - // FSABuildTool.main(params); } - - private File tab2morph(File dictInFile, String separator, - EncoderType encoderType) throws Exception { - - // create tab2morph parameters - List<String> tag2morphParams = new ArrayList<String>(); - tag2morphParams.add("tab2morph"); - - tag2morphParams.add("--annotation"); - tag2morphParams.add(separator); - - tag2morphParams.add("--e"); - tag2morphParams.add(encoderType.name()); - - tag2morphParams.add("-i"); - 
tag2morphParams.add(dictInFile.getAbsolutePath()); - - // we need a temporary file to store the intermediate output - File tmp = File.createTempFile("tab2morph", ".txt"); - tmp.deleteOnExit(); - - tag2morphParams.add("-o"); - tag2morphParams.add(tmp.getAbsolutePath()); - - Launcher.main(tag2morphParams.toArray(new String[tag2morphParams.size()])); - - return tmp; - } - } http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java index 193599b..5ea2e4f 100644 --- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java +++ b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderParams.java @@ -19,7 +19,6 @@ package opennlp.morfologik.cmdline.builder; import java.io.File; -import morfologik.stemming.EncoderType; import opennlp.tools.cmdline.ArgumentParser.OptionalParameter; import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; import opennlp.tools.cmdline.params.EncodingParameter; @@ -29,18 +28,30 @@ import opennlp.tools.cmdline.params.EncodingParameter; */ interface MorfologikDictionaryBuilderParams extends EncodingParameter { - @ParameterDescription(valueName = "in", description = "Plain file with one entry per line") + @ParameterDescription(valueName = "in", description = "The input file (base,inflected,tag). An associated metadata (*.info) file must exist.") File getInputFile(); - - @ParameterDescription(valueName = "out", description = "The generated dictionary file.") - File getOutputFile(); - - @ParameterDescription(valueName = "sep", description = "The FSA dictionary separator. 
Default is '+'.") - @OptionalParameter(defaultValue = "+") - String getFSADictSeparator(); - @ParameterDescription(valueName = "sep", description = "The type of lemma-inflected form encoding compression that precedes automaton construction. Allowed values: [suffix, infix, prefix, none]. Details are in Daciuk's paper and in the code. ") - @OptionalParameter(defaultValue = "prefix") - EncoderType getEncoderType(); - + @ParameterDescription(valueName = "true|false", description = "Accept leading BOM bytes (UTF-8).") + @OptionalParameter(defaultValue="false") + Boolean getAcceptBOM(); + + @ParameterDescription(valueName = "true|false", description = "Accept CR bytes in input sequences (\r).") + @OptionalParameter(defaultValue="false") + Boolean getAcceptCR(); + + @ParameterDescription(valueName = "FSA5|CFSA2", description = "Automaton serialization format.") + @OptionalParameter(defaultValue="FSA5") + String getFormat(); + + @ParameterDescription(valueName = "true|false", description = "Ignore empty lines in the input.") + @OptionalParameter(defaultValue="false") + Boolean getIgnoreEmpty(); + + @ParameterDescription(valueName = "true|false", description = "Overwrite the output file if it exists.") + @OptionalParameter(defaultValue="false") + Boolean getOverwrite(); + + @ParameterDescription(valueName = "true|false", description = "Validate input to make sure it makes sense.") + @OptionalParameter(defaultValue="false") + Boolean getValidate(); } http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java index 741515e..eb9b51c 100644 --- a/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java +++ 
b/src/main/java/opennlp/morfologik/cmdline/builder/MorfologikDictionaryBuilderTool.java @@ -17,11 +17,10 @@ package opennlp.morfologik.cmdline.builder; -import static opennlp.morfologik.util.MorfologikUtil.getExpectedPropertiesFile; - import java.io.File; -import java.nio.charset.Charset; +import java.nio.file.Path; +import morfologik.stemming.DictionaryMetadata; import opennlp.morfologik.builder.MorfologikDictionayBuilder; import opennlp.tools.cmdline.BasicCmdLineTool; import opennlp.tools.cmdline.CmdLineUtil; @@ -44,18 +43,16 @@ public class MorfologikDictionaryBuilderTool extends BasicCmdLineTool { Params params = validateAndParseParams(args, Params.class); File dictInFile = params.getInputFile(); - File dictOutFile = params.getOutputFile(); - File propertiesFile = getExpectedPropertiesFile(dictOutFile); - Charset encoding = params.getEncoding(); CmdLineUtil.checkInputFile("dictionary input file", dictInFile); - CmdLineUtil.checkOutputFile("dictionary output file", dictOutFile); - CmdLineUtil.checkOutputFile("properties output file", propertiesFile); + Path metadataPath = DictionaryMetadata.getExpectedMetadataLocation(dictInFile.toPath()); + CmdLineUtil.checkInputFile("dictionary metadata (.info) input file", metadataPath.toFile()); MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); try { - builder.build(dictInFile, dictOutFile, propertiesFile, encoding, - params.getFSADictSeparator(), params.getEncoderType()); + builder.build(dictInFile.toPath(), params.getOverwrite(), + params.getValidate(), params.getAcceptBOM(), params.getAcceptCR(), + params.getIgnoreEmpty()); } catch (Exception e) { throw new TerminateToolException(-1, "Error while creating Morfologik POS Dictionay: " + e.getMessage(), e); http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java ---------------------------------------------------------------------- diff --git 
a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java index 99694a5..2090ce5 100644 --- a/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java +++ b/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java @@ -18,7 +18,7 @@ package opennlp.morfologik.lemmatizer; import java.io.IOException; -import java.net.URL; +import java.nio.file.Path; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; @@ -26,11 +26,11 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import opennlp.tools.lemmatizer.DictionaryLemmatizer; import morfologik.stemming.Dictionary; import morfologik.stemming.DictionaryLookup; import morfologik.stemming.IStemmer; import morfologik.stemming.WordData; +import opennlp.tools.lemmatizer.DictionaryLemmatizer; public class MorfologikLemmatizer implements DictionaryLemmatizer { @@ -38,9 +38,9 @@ public class MorfologikLemmatizer implements DictionaryLemmatizer { public final Set<String> constantTags = new HashSet<String>(Arrays.asList( "NNP", "NP00000")); - public MorfologikLemmatizer(URL dictURL) throws IllegalArgumentException, + public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException, IOException { - dictLookup = new DictionaryLookup(Dictionary.read(dictURL)); + dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath)); } private HashMap<List<String>, String> getLemmaTagsDict(String word) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java ---------------------------------------------------------------------- diff --git a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java index f022a86..723b1ce 100644 --- 
a/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java +++ b/src/main/java/opennlp/morfologik/tagdict/MorfologikPOSTaggerFactory.java @@ -54,9 +54,21 @@ public class MorfologikPOSTaggerFactory extends POSTaggerFactory { public MorfologikPOSTaggerFactory() { } + /** + * Creates a new {@link POSTaggerFactory} that uses the a Morfologik based {@link TagDictionary}. + * + * @param ngramDictionary a ngramDictionary + * @param morfologikDictionary a Morfologik dictionary + * @param morfologikDictionaryMetadata the dictionary metadata + * @throws IOException invalid Morfologik dictionary + */ public MorfologikPOSTaggerFactory(Dictionary ngramDictionary, - TagDictionary posDictionary) { + byte[] morfologikDictionary, byte[] morfologikDictionaryMetadata) throws IOException { super(ngramDictionary, null); + this.dictData = morfologikDictionary; + this.dictInfo = morfologikDictionaryMetadata; + + this.dict = createMorfologikDictionary(dictData, dictInfo); } @Override http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java index 730025c..0a7ba48 100644 --- a/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java +++ b/src/test/java/opennlp/morfologik/builder/POSDictionayBuilderTest.java @@ -18,14 +18,12 @@ package opennlp.morfologik.builder; import java.io.File; -import java.io.FileInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.nio.charset.Charset; -import java.util.Properties; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardCopyOption; import junit.framework.TestCase; -import morfologik.stemming.EncoderType; +import morfologik.stemming.DictionaryMetadata; import 
opennlp.morfologik.lemmatizer.MorfologikLemmatizer; import org.junit.Test; @@ -34,56 +32,27 @@ public class POSDictionayBuilderTest extends TestCase { @Test public void testBuildDictionary() throws Exception { - MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); - File dictInFile = new File(POSDictionayBuilderTest.class.getResource( - "/dictionaryWithLemma.txt").getFile()); - - File dictOutFile = File.createTempFile( - POSDictionayBuilderTest.class.getName(), ".dict"); - - builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX); + + Path output = createMorfologikDictionary(); - MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI() - .toURL()); + MorfologikLemmatizer ml = new MorfologikLemmatizer(output); assertNotNull(ml); } - - @Test - public void testPropertiesCreation() throws Exception { - - Charset c = Charset.forName("iso-8859-1"); - String sep = "_"; + + public static Path createMorfologikDictionary() throws Exception { + Path tabFilePath = File.createTempFile( + POSDictionayBuilderTest.class.getName(), ".txt").toPath(); + Path infoFilePath = DictionaryMetadata.getExpectedMetadataLocation(tabFilePath); - EncoderType encoderType = EncoderType.PREFIX; - Properties p = createPropertiesHelper(c, sep, encoderType); - - assertEquals(c.name(), p.getProperty("fsa.dict.encoding")); - assertEquals(sep, p.getProperty("fsa.dict.separator")); - assertEquals(encoderType, - EncoderType.valueOf(p.getProperty("fsa.dict.encoder"))); + Files.copy(POSDictionayBuilderTest.class.getResourceAsStream( + "/dictionaryWithLemma.txt"), tabFilePath, StandardCopyOption.REPLACE_EXISTING); + Files.copy(POSDictionayBuilderTest.class.getResourceAsStream( + "/dictionaryWithLemma.info"), infoFilePath, StandardCopyOption.REPLACE_EXISTING); - encoderType = EncoderType.SUFFIX; - p = createPropertiesHelper(c, sep, encoderType); - assertEquals(encoderType, - EncoderType.valueOf(p.getProperty("fsa.dict.encoder"))); - - } - - 
private Properties createPropertiesHelper(Charset c, String sep, - EncoderType encoderType) throws IOException { MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); - File f = File.createTempFile(POSDictionayBuilderTest.class.getName(), - ".info"); - builder.createProperties(c, sep, encoderType, f); - - InputStream is = new FileInputStream(f); - - Properties prop = new Properties(); - prop.load(is); - is.close(); - f.delete(); - return prop; + + return builder.build(tabFilePath); } } http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java index 87fc2cc..6b7525e 100644 --- a/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java +++ b/src/test/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizerTest.java @@ -2,11 +2,8 @@ package opennlp.morfologik.lemmatizer; import static org.junit.Assert.assertEquals; -import java.io.File; -import java.nio.charset.Charset; +import java.nio.file.Path; -import morfologik.stemming.EncoderType; -import opennlp.morfologik.builder.MorfologikDictionayBuilder; import opennlp.morfologik.builder.POSDictionayBuilderTest; import opennlp.tools.lemmatizer.DictionaryLemmatizer; @@ -28,17 +25,9 @@ public class MorfologikLemmatizerTest { private MorfologikLemmatizer createDictionary(boolean caseSensitive) throws Exception { - MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); - File dictInFile = new File(POSDictionayBuilderTest.class.getResource( - "/dictionaryWithLemma.txt").getFile()); + Path output = POSDictionayBuilderTest.createMorfologikDictionary(); - File dictOutFile = File.createTempFile( - POSDictionayBuilderTest.class.getName(), ".dict"); - - builder.build(dictInFile, 
dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX); - - MorfologikLemmatizer ml = new MorfologikLemmatizer(dictOutFile.toURI() - .toURL()); + MorfologikLemmatizer ml = new MorfologikLemmatizer(output); return ml; } http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java index d605e15..c6c9e04 100644 --- a/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java +++ b/src/test/java/opennlp/morfologik/tagdict/MorfologikTagDictionaryTest.java @@ -3,16 +3,11 @@ package opennlp.morfologik.tagdict; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -import java.io.File; -import java.nio.charset.Charset; import java.util.Arrays; import java.util.List; import morfologik.stemming.Dictionary; -import morfologik.stemming.EncoderType; -import opennlp.morfologik.builder.MorfologikDictionayBuilder; import opennlp.morfologik.builder.POSDictionayBuilderTest; -import opennlp.morfologik.tagdict.MorfologikTagDictionary; import opennlp.tools.postag.TagDictionary; import org.junit.Test; @@ -74,17 +69,8 @@ public class MorfologikTagDictionaryTest { private MorfologikTagDictionary createDictionary(boolean caseSensitive, List<String> constant) throws Exception { - MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); - File dictInFile = new File(POSDictionayBuilderTest.class.getResource( - "/dictionaryWithLemma.txt").getFile()); - - File dictOutFile = File.createTempFile( - POSDictionayBuilderTest.class.getName(), ".dict"); - - builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", EncoderType.PREFIX); - - MorfologikTagDictionary ml = new MorfologikTagDictionary( - 
Dictionary.read(dictOutFile.toURI().toURL()), caseSensitive); + Dictionary dic = Dictionary.read(POSDictionayBuilderTest.createMorfologikDictionary()); + MorfologikTagDictionary ml = new MorfologikTagDictionary(dic, caseSensitive); return ml; } http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java ---------------------------------------------------------------------- diff --git a/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java new file mode 100644 index 0000000..6c6814b --- /dev/null +++ b/src/test/java/opennlp/morfologik/tagdict/POSTaggerFactoryTest.java @@ -0,0 +1,108 @@ +///* +// * Licensed to the Apache Software Foundation (ASF) under one or more +// * contributor license agreements. See the NOTICE file distributed with +// * this work for additional information regarding copyright ownership. +// * The ASF licenses this file to You under the Apache License, Version 2.0 +// * (the "License"); you may not use this file except in compliance with +// * the License. You may obtain a copy of the License at +// * +// * http://www.apache.org/licenses/LICENSE-2.0 +// * +// * Unless required by applicable law or agreed to in writing, software +// * distributed under the License is distributed on an "AS IS" BASIS, +// * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// * See the License for the specific language governing permissions and +// * limitations under the License. 
+// */ +// +//package opennlp.morfologik.tagdict; +// +//import static org.junit.Assert.assertTrue; +// +//import java.io.ByteArrayInputStream; +//import java.io.ByteArrayOutputStream; +//import java.io.File; +//import java.io.IOException; +//import java.io.InputStream; +//import java.io.InputStreamReader; +//import java.nio.charset.Charset; +//import java.nio.file.Files; +//import java.nio.file.Path; +//import java.nio.file.Paths; +// +//import morfologik.stemming.DictionaryMetadata; +//import morfologik.stemming.EncoderType; +//import opennlp.morfologik.builder.MorfologikDictionayBuilder; +//import opennlp.morfologik.builder.POSDictionayBuilderTest; +//import opennlp.tools.dictionary.Dictionary; +//import opennlp.tools.postag.DefaultPOSSequenceValidator; +//import opennlp.tools.postag.POSContextGenerator; +//import opennlp.tools.postag.POSDictionary; +//import opennlp.tools.postag.POSModel; +//import opennlp.tools.postag.POSSample; +//import opennlp.tools.postag.POSTaggerFactory; +//import opennlp.tools.postag.POSTaggerME; +//import opennlp.tools.postag.WordTagSampleStream; +//import opennlp.tools.util.BaseToolFactory; +//import opennlp.tools.util.InvalidFormatException; +//import opennlp.tools.util.ObjectStream; +//import opennlp.tools.util.TrainingParameters; +//import opennlp.tools.util.model.ModelType; +// +//import org.junit.Test; +// +///** +// * Tests for the {@link POSTaggerFactory} class. 
+// */ +//public class POSTaggerFactoryTest { +// +// private static ObjectStream<POSSample> createSampleStream() +// throws IOException { +// InputStream in = POSTaggerFactoryTest.class.getClassLoader() +// .getResourceAsStream("AnnotatedSentences.txt"); +// +// return new WordTagSampleStream((new InputStreamReader(in))); +// } +// +// static POSModel trainPOSModel(ModelType type, POSTaggerFactory factory) +// throws IOException { +// return POSTaggerME.train("en", createSampleStream(), +// TrainingParameters.defaultParams(), factory); +// } +// +// @Test +// public void testPOSTaggerWithCustomFactory() throws Exception { +// +// MorfologikDictionayBuilder builder = new MorfologikDictionayBuilder(); +// File dictInFile = new File(POSDictionayBuilderTest.class.getResource( +// "/dictionaryWithLemma.txt").getFile()); +// +// File dictOutFile = File.createTempFile( +// POSDictionayBuilderTest.class.getName(), ".dict"); +// +// builder.build(dictInFile, dictOutFile, Charset.forName("UTF-8"), "+", +// EncoderType.PREFIX); +// +// Path dictPath = dictOutFile.toPath(); +// Path metaPath = DictionaryMetadata.getExpectedMetadataLocation(dictPath); +// +// byte[] dic = Files.readAllBytes(dictPath); +// byte[] meta = Files.readAllBytes(metaPath); +// +// POSModel posModel = trainPOSModel(ModelType.MAXENT, +// new MorfologikPOSTaggerFactory(null, dic, meta)); +// +// POSTaggerFactory factory = posModel.getFactory(); +// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory); +// +// ByteArrayOutputStream out = new ByteArrayOutputStream(); +// posModel.serialize(out); +// ByteArrayInputStream in = new ByteArrayInputStream(out.toByteArray()); +// +// POSModel fromSerialized = new POSModel(in); +// +// factory = fromSerialized.getFactory(); +// assertTrue(factory.getTagDictionary() instanceof MorfologikPOSTaggerFactory); +// } +// +//} \ No newline at end of file 
http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/AnnotatedSentences.txt ---------------------------------------------------------------------- diff --git a/src/test/resources/AnnotatedSentences.txt b/src/test/resources/AnnotatedSentences.txt new file mode 100644 index 0000000..b40be87 --- /dev/null +++ b/src/test/resources/AnnotatedSentences.txt @@ -0,0 +1,136 @@ +Last_JJ September_NNP ,_, I_PRP tried_VBD to_TO find_VB out_RP the_DT address_NN of_IN an_DT old_JJ school_NN friend_NN whom_WP I_PRP had_VBD not_RB seen_VBN for_IN 15_CD years_NNS ._. +I_PRP just_RB knew_VBD his_PRP$ name_NN ,_, Alan_NNP McKennedy_NNP ,_, and_CC I_PRP 'd_MD heard_VBD the_DT rumour_NN that_IN he_PRP 'd_MD moved_VBD to_TO Scotland_NNP ,_, the_DT country_NN of_IN his_PRP$ ancestors_NNS ._. +So_IN I_PRP called_VBD Julie_NNP ,_, a_DT friend_NN who's_WDT still_RB in_IN contact_NN with_IN him_PRP ._. +She_PRP told_VBD me_PRP that_IN he_PRP lived_VBD in_IN 23213_CD Edinburgh_NNP ,_, Worcesterstreet_NNP 12_CD ._. +I_PRP wrote_VBD him_PRP a_DT letter_NN right_RB away_RB and_CC he_PRP answered_VBD soon_RB ,_, sounding_VBG very_RB happy_JJ and_CC delighted_JJ ._. + +Last_JJ year_NN ,_, I_PRP wanted_VBD to_TO write_VB a_DT letter_NN to_TO my_PRP$ grandaunt_NN ._. +Her_PRP$ 86_CD th_NN birthday_NN was_VBD on_IN October_NNP 6_CD ,_, and_CC I_PRP no_RB longer_RB wanted_VBD to_TO be_VB hesitant_JJ to_TO get_VB in_IN touch_NN with_IN her_PRP ._. +I_PRP did_VBD not_RB know_VB her_PRP face-to-face_RB ,_, and_CC so_RB it_PRP was_VBD not_RB easy_JJ for_IN me_PRP to_TO find_VB out_RP her_PRP$ address_NN ._. +As_IN she_PRP had_VBD two_CD apartments_NNS in_IN different_JJ countries_NNS ,_, I_PRP decided_VBD to_TO write_VB to_TO both_DT ._. +The_DT first_JJ was_VBD in_IN 12424_CD Paris_NNP in_IN Rue-de-Grandes-Illusions_NNP 5_CD ._. +But_CC Marie_NNP Clara_NNP ,_, as_IN my_PRP$ aunt_NN is_VBZ called_VBN ,_, prefered_VBN her_PRP$ apartment_NN in_IN Berlin_NNP ._. 
+It_PRP 's_VBZ postcode_JJ is_VBZ 30202_CD ._. +She_PRP lived_VBD there_RB ,_, in_IN beautiful_JJ Kaiserstraße_NNP 13_CD ,_, particulary_NN in_IN summer_NN ._. + +Hi_UH my_PRP$ name_NN is_VBZ Stefanie_NNP Schmidt_NNP ,_, how_WRB much_RB is_VBZ a_DT taxi_NN from_IN Ostbahnhof_NNP to_TO Hauptbahnhof_NNP ?_. +About_IN 10_CD Euro_NNP ,_, I_PRP reckon_VBP ._. +That_DT sounds_VBZ good_JJ ._. +So_RB please_VB call_VB a_DT driver_NN to_TO Leonardstraße_NNP 112_CD ,_, near_IN the_DT Ostbahnhof_NNP in_IN 56473_CD Hamburg_NNP ._. +I_PRP 'd_MD like_VB to_TO be_VB at_IN Silberhornstraße_NNP 12_CD as_RB soon_RB as_IN possible_JJ ._. +Thank_VB you_PRP very_RB much_RB !_. + +Hi_NNP Mike_NNP ,_, it_PRP 's_VBZ Stefanie_NNP Schmidt_NNP ._. +I_PRP 'm_VBP in_IN Nürnberg_NNP at_IN the_DT moment_NN and_CC I_PRP 've_VBP got_VBD the_DT problem_NN that_IN my_PRP$ bike_NN has_VBZ broken_VBN ._. +Could_MD you_PRP please_VB pick_VB me_PRP up_RP from_IN Seidlstraße_NNP 56_CD ,_, I_PRP 'm_VBP in_IN the_DT Café_NNP "Mondnacht"_NNP at_IN the_DT moment_NN ._. +Please_VB hurry_VB up_RB ,_, I_PRP need_VBP to_TO be_VB back_RB in_IN Ulm_NNP at_IN 8_CD p.m._NN !_. + +My_PRP$ husband_NN George_NNP and_CC me_PRP recently_RB celebrated_VBD our_PRP$ 10_CD th_JJ wedding_NN anniversary_NN ._. +We_PRP got_VBD married_VBN on_IN March_NNP 11_CD ,_, 1995_CD ._. +Therefore_RB ,_, we_PRP found_VBD a_DT photo_NN album_NN with_IN pictures_NNS of_IN our_PRP$ first_JJ own_JJ apartment_NN ,_, which_WDT was_VBD in_IN 81234_CD Munich_NNP ._. +As_IN a_DT young_JJ married_JJ couple_NN ,_, we_PRP did_VBD not_RB have_VB enough_JJ money_NN to_TO afford_VB a_DT bigger_JJR lodge_NN than_IN this_DT one_CD in_IN Blumenweg_NNP 1_CD ._. +But_CC only_RB five_CD years_NNS later_RB ,_, my_PRP$ husband_NN was_VBD offered_VBN a_DT well-payed_JJ job_NN in_IN 17818_CD Hamburg_NNP ,_, so_IN we_PRP moved_VBD there_RB ._. 
+Since_IN then_RB ,_, our_PRP$ guests_NNS have_VBP to_TO ring_VB at_IN Veilchenstraße_NNP 11_CD if_IN they_PRP want_VBP to_TO visit_VB us_PRP ,_, Luise_NNP and_CC George_NNP Bauer_NNP ._. + +I_PRP read_VBD your_PRP$ help-wanted_JJ ad_NN with_IN great_JJ attention_NN ._. +I_PRP 'm_VBP a_DT student_NN of_IN informatics_NNS ,_, 6th_JJ semester,_NN and_CC I_PRP 'm_VBP very_RB interested_VBN in_IN your_PRP$ part-time_JJ job_NN offer_NN ._. +I_PRP have_VBP a_DT competent_JJ knowledge_NN of_IN programming_NN and_CC foreign_JJ languages_NNS ,_, like_IN French_JJ and_CC Italian_JJ ._. +I_PRP 'm_VBP looking_VBG forward_RB to_TO your_PRP$ reply_NN ._. + +Alisa_NNP Fernandes_NNP ,_, a_DT tourist_NN from_IN Spain_NNP ,_, went_VBD to_TO the_DT reception_NN desk_NN of_IN the_DT famous_JJ Highfly-Hotel_NNP in_IN 30303_CD Berlin_NNP ._. +As_IN she_PRP felt_VBD quite_RB homesick_JJ ,_, she_PRP asked_VBD the_DT staff_NN if_IN they_PRP knew_VBD a_DT good_JJ Spanish_JJ restaurant_NN in_IN Berlin_NNP ._. +The_DT concierge_NN told_VBD her_PRP to_TO go_VB to_TO the_DT "Tapasbar"_NN in_IN Chesterstr._NNP 2_CD ._. +Alisa_NNP appreciated_VBD the_DT hint_NN and_CC enjoyed_VBD a_DT delicious_JJ traditional_JJ meal_NN ._. + +An_DT old_JJ friend_NN from_IN France_NNP is_VBZ currently_RB travelling_VBG around_IN Europe_NNP ._. +Yesterday_NN ,_, she_PRP arrived_VBD in_IN Berlin_NNP and_CC we_PRP met_VBD up_RP spontaneously_RB ._. +She_PRP wanted_VBD me_PRP to_TO show_VB her_PRP some_DT famous_JJ sights_NNS ,_, like_IN the_DT Brandenburger_NNP Tor_NNP and_CC the_DT Reichstag_NNP ._. +But_CC it_PRP was_VBD not_RB easy_JJ to_TO meet_VB up_RP in_IN the_DT city_NN because_IN she_PRP hardly_RB knows_VBZ any_DT streetname_NN or_CC building_NN ._. +So_IN I_PRP proposed_VBD to_TO meet_VB at_IN a_DT quite_RB local_JJ point:_NN the_DT café_NN "Daily's"_NN in_IN Unter-den-Linden_NNP 18,_CD 30291_CD Berlin_NNP ._. 
+It_PRP is_VBZ five_CD minutes_NNS away_RB from_IN the_DT underground_JJ station_NN "Westbad"_NN ._. +She_PRP found_VBD it_PRP instantly_RB and_CC we_PRP spent_VBD a_DT great_JJ day_NN in_IN the_DT capital_NN ._. + +Where_WRB did_VBD you_PRP get_VB those_DT great_JJ shoes_NNS ?_. +They_PRP look_VBP amazing_JJ ,_, I_PRP love_VBP the_DT colour_NN ._. +Are_VBP they_PRP made_VBN of_IN leather_NN ?_. +No,_NNP that_DT 's_VBZ faked_VBN ._. +But_CC anyway_RB ,_, I_PRP like_VBP them_PRP too_RB ._. +I_PRP got_VBD them_PRP from_IN Hamburg._NNP +Do_VBP not_RB you_PRP know_VB the_DT famous_JJ shop_NN in_IN Veilchenstra�e_NNP ?_. +It_PRP 's_VBZ called_VBN "Twentytwo"_NNP ._. +I_PRP 've_VBP never_RB heard_VBN of_IN that_DT before_RB ._. +Could_MD you_PRP give_VB me_PRP the_DT complete_JJ address_NN ?_. +Sure_JJ ,_, it_PRP 's_VBZ in_IN Veilchenstra�e_NNP 12_CD ,_, in_IN 78181_CD Hamburg_NNP ._. +I_PRP deem_VBP it_PRP best_RB to_TO write_VB a_DT letter_NN to_TO the_DT owner_NN if_IN the_DT shoes_NNS are_VBP still_RB available_JJ ._. +His_PRP$ name_NN is_VBZ Gerhard_NNP Fritsch_NNP ._. + +Hi_UH ,_, am_VBP I_PRP talking_VBG to_TO the_DT inquiries_NNS ?_. +My_PRP$ name_NN is_VBZ Mike_NNP Sander_NNP and_CC I_PRP 'd_MD like_VB to_TO know_VB if_IN it_PRP is_VBZ possible_JJ to_TO get_VB information_NN about_IN an_DT address_NN if_IN I_PRP merely_RB know_VBP the_DT name_NN and_CC the_DT phone_NN number_NN of_IN a_DT person_NN !_. +How_WRB is_VBZ he_PRP or_CC she_PRP called_VBD ?_. +His_PRP$ name_NN is_VBZ Stefan_NNP Miller_NNP and_CC his_PRP$ number_NN is_VBZ the_DT 030/827234_CD ._. +I'll_NNP have_VBP a_DT look_NN in_IN the_DT computer..._NN +I_PRP found_VBD a_DT Stefan_NNP Miller_NNP who_WP lives_VBZ in_IN Leipzig._NNP +Is_VBZ that_DT right_NN ?_. +Yes_UH ,_, it_PRP definitely_RB is_VBZ ._. +So_RB Stefan_NNP Miller_NNP lives_VBZ in_IN Heinrich-Heine-Stra�e_NNP 112_CD ,_, in_IN 20193_CD Leipzig_NNP ._. +Thank_VB you_PRP very_RB much_RB for_IN the_DT information_NN ._. +Bye_NNP !_. 
+ +On_IN July_NNP 14_CD ,_, the_DT father_NN of_IN a_DT family_NN got_VBD painfully_RB injured_VBN after_IN he_PRP had_VBD tried_VBN to_TO start_VB a_DT barbecue_NN ._. +The_DT flaring_VBG flames_NNS burnt_VBP instantly_RB through_IN his_PRP$ jacket_NN ,_, which_WDT he_PRP managed_VBD to_TO pull_VB off_RP last-minute_JJ ._. +Although_IN the_DT wounds_NNS were_VBD n't_RB life-threatening_JJ ,_, it_PRP was_VBD urgent_JJ to_TO bring_VB him_PRP directly_RB into_IN ambulance_NN ._. +But_CC the_DT only_JJ hospital_NN that_WDT had_VBD opened_VBN that_IN Sunday_NNP was_VBD the_DT Paracelsus_NNP Hospital_NNP in_IN 83939_CD Weilheim_NNP ,_, which_WDT was_VBD 2_CD hours_NNS away_RB ._. +Convulsed_JJ with_IN pain_NN ,_, the_DT man_NN finally_RB arrived_VBD in_IN Stifterstra�e_NNP 15_CD ,_, where_WRB the_DT personal_NN immediately_RB took_VBD care_NN of_IN him_PRP ._. + +Last_JJ year_NN ,_, I_PRP worked_VBD as_IN a_DT delivery_NN boy_NN for_IN a_DT small_JJ local_JJ magazine_NN ._. +I_PRP worked_VBD in_IN the_DT area_NN of_IN 83454_CD Ottobrunn_NNP ._. +I_PRP had_VBD a_DT list_NN with_IN the_DT home_NN addresses_NNS of_IN our_PRP$ costumers_NNS whom_WP I_PRP brought_VBD their_PRP$ papers_NNS once_RB a_DT week_NN ._. +An_DT elderly_JJ lady_NN ,_, who_WP was_VBD called_VBN Elenor_NNP Meier_NNP ,_, lived_VBD in_IN G�rtnerweg_NNP 6_CD ,_, and_CC I_PRP always_RB drove_VBD there_RB first_RB ,_, because_IN I_PRP liked_VBD her_PRP the_DT most_JJS ._. +Afterwards_RB ,_, I_PRP went_VBD to_TO a_DT student_NN ,_, Gina_NNP Schneider_NNP ,_, who_WP lived_VBD still_RB in_IN her_PRP$ parent's_NNS house_NN in_IN G�rtnerweg_NNP 25_CD ._. +The_DT last_JJ in_IN line_NN was_VBD the_DT retired_JJ teacher_NN Bruno_NNP Schulz_NNP in_IN Dramenstra�e_NNP 15_CD ._. +He_PRP was_VBD friendly_JJ enough_RB to_TO tip_VB sometimes_RB ._. + +Our_PRP$ business_NN company_NN was_VBD founded_VBN in_IN 1912_CD by_IN the_DT singer_NN and_CC entertainer_NN Michel_NNP Seile_NNP ._. 
+He_PRP opened_VBD the_DT first_JJ agency_NN in_IN Erding_NNP ,_, a_DT small_JJ town_NN near_IN Munich_NNP ._. +Now_RB ,_, more_JJR than_IN 90_CD years_NNS of_IN turbulent_JJ ups_NNS and_CC downs_NNS later_RB ,_, we_PRP finally_RB decided_VBD to_TO situate_VB our_PRP$ company_NN in_IN a_DT more_JJR central_JJ and_CC frequented_JJ area_NN ._. +Last_JJ year_NN ,_, we_PRP moved_VBD into_IN an_DT empty_JJ factory_NN building_NN in_IN 30303_CD Berlin_NNP ._. +It_PRP is_VBZ located_VBN in_IN Barmerstr._NNP 34_CD ._. + +When_WRB George_NNP Miller_NNP ,_, a_DT tourist_NN from_IN England_NNP ,_, came_VBD to_TO Munich_NNP ,_, he_PRP had_VBD no_DT idea_NN how_WRB to_TO read_VB the_DT city_NN maps_NNS ._. +He_PRP depended_VBD completely_RB on_IN the_DT help_NN and_CC information_NN of_IN German_JJ pedestrians_NNS ._. +One_CD day_NN ,_, he_PRP simply_RB could_MD not_RB find_VB the_DT famous_JJ Lenbachhaus_NNP ._. +So_RB he_PRP asked_VBD a_DT young_JJ woman_NN for_IN help_NN ._. +She_PRP pointed_VBD at_IN a_DT street_NN sign_NN and_CC explained_VBD to_TO him_PRP that_IN he_PRP 'd_MD find_VB the_DT Lenbachhaus_NNP in_IN Luisenstra�e_NNP 33_CD ,_, which_WDT is_VBZ in_IN 80333_CD Munich_NNP ._. +Miller_NNP was_VBD very_RB grateful_JJ and_CC could_MD finally_RB enjoy_VB the_DT exhibition_NN ._. + +On_IN March_NNP 15_CD ,_, there_EX was_VBD an_DT accident_NN near_IN Munich_NNP ._. +The_DT driver_NN got_VBD badly_RB injured_VBN ._. +Driving_VBG alone_RB not_RB far_RB from_IN her_PRP$ home_NN ,_, the_DT middle-aged_JJ woman_NN crashed_VBD at_IN high_JJ speed_NN into_IN a_DT tree_NN ._. +A_DT resident_NN ,_, who_WP lives_VBZ near_IN the_DT street_NN where_WRB the_DT accident_NN took_VBD place_NN ,_, called_VBN instantly_RB the_DT police_NN ._. +He_PRP reported_VBD what_WP had_VBD happened_VBN and_CC gave_VBD his_PRP$ name_NN and_CC address_NN to_TO the_DT officer_NN ._. 
+He_PRP 's_VBZ called_VBN Peter_NNP Schubert_NNP and_CC he_PRP lives_VBZ at_IN Max-L�w-Stra�e_NNP 13_CD in_IN 84630_CD Gauting_NNP ._. +The_DT police_NN arrived_VBD ten_CD minutes_NNS later_RB and_CC brought_VBD the_DT woman_NN into_IN hospital_NN ._. +Although_IN she_PRP had_VBD multiple_JJ trauma_NN ,_, she_PRP 's_VBZ out_IN of_IN mortal_JJ danger_NN ._. + +Hi_NNP ,_, how_WRB are_VBP you_PRP ?_. +Are_VBP nt't_RB you_PRP a_DT friend_NN of_IN Natalie_NNP ?_. +Yeah_UH for_IN sure_JJ ._. +How_WRB did_VBD you_PRP know_VB that_DT ?_. +I_PRP saw_VBD you_PRP sitting_VBG next_JJ to_TO her_PRP at_IN uni_JJ ._. +Yeah_NNP she_PRP 's_VBZ my_PRP$ best_JJS friend_NN ._. +Are_VBP you_PRP going_VBG to_TO her_PRP party_NN next_JJ friday_NN ?_. +Oh_UH yes_UH ,_, I_PRP 'd_MD really_RB like_VB to_TO ._. +But_CC in_IN fact_NN I_PRP do_VBP n't_RB know_VB yet_RB where_WRB it_PRP takes_VBZ place_NN ._. +I_PRP can_MD tell_VB you_PRP :_: ring_NN at_IN Baumann,_NNP Meisenstra�e_NNP 5_CD ,_, in_IN 81737_CD Munich_NNP ._. +The_DT party_NN starts_VBZ at_IN 9_CD p.m._NN ._. +I_PRP hope_VBP you_PRP 'll_MD find_VB it_PRP ._. +Thank_VB you_PRP very_RB much_RB ,_, see_VBP you_PRP next_JJ friday_NN !_. + +My_PRP$ name_NN is_VBZ Michael_NNP Hinterhofer_NNP ._. +When_WRB I_PRP was_VBD 21_CD ,_, I_PRP moved_VBD out_RP from_IN my_PRP$ parents_NNS home_NN into_IN my_PRP$ first_JJ own_JJ appartment_NN in_IN order_NN to_TO study_VB in_IN a_DT bigger_JJR city_NN ._. +My_PRP$ new_JJ home_NN was_VBD in_IN Lilienstra�e_NNP 1_CD in_IN 25334_CD Hamburg_NNP ._. +But_CC I_PRP realized_VBD quickly_RB that_IN life_NN in_IN a_DT metropolis_NN was_VBD n't_RB relaxed_VBN enough_RB for_IN me_PRP ._. +So_IN I_PRP decided_VBD to_TO move_VB into_IN a_DT smaller_JJR town_NN ._. +Now_RB I_PRP 'm_VBP a_DT tenant_NN with_IN an_DT elderly_JJ widow_NN ._. +We_PRP live_VBP in_IN B�rgerstra�e_NNP 2_CD in_IN 63737_CD Heidelberg_NNP ._. 
+I_PRP really_RB like_IN the_DT smalltown_JJ flair_NN and_CC my_PRP$ studies_NNS at_IN Heidelberg_NNP 's_POS notable_JJ university_NN ._. \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.info ---------------------------------------------------------------------- diff --git a/src/test/resources/dictionaryWithLemma.info b/src/test/resources/dictionaryWithLemma.info new file mode 100644 index 0000000..ad5fe8d --- /dev/null +++ b/src/test/resources/dictionaryWithLemma.info @@ -0,0 +1,15 @@ +# +# REQUIRED PROPERTIES +# + +# Column (lemma, inflected, tag) separator. This must be a single byte in the target encoding. +fsa.dict.separator=, + +# The charset in which the input is encoded. UTF-8 is strongly recommended. +fsa.dict.encoding=UTF-8 + +# The type of lemma-inflected form encoding compression that precedes automaton +# construction. Allowed values: [suffix, infix, prefix, none]. +# Details are in Daciuk's paper and in the code. +# Leave at 'prefix' if not sure. +fsa.dict.encoder=prefix \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp/blob/1314887f/src/test/resources/dictionaryWithLemma.txt ---------------------------------------------------------------------- diff --git a/src/test/resources/dictionaryWithLemma.txt b/src/test/resources/dictionaryWithLemma.txt index 5ac7111..09d39e3 100644 --- a/src/test/resources/dictionaryWithLemma.txt +++ b/src/test/resources/dictionaryWithLemma.txt @@ -1,10 +1,11 @@ -casa casa NOUN -casa casar V -Casa Casa PROP -casinha casa NOUN -casona casa NOUN -menina menino NOUN -menino menino NOUN -meninão menino NOUN -menininho menino NOUN -carro NOUN +casa,casa,NOUN +casar,casa,V +casar,casar,V-INF +Casa,Casa,PROP +casa,casinha,NOUN +casa,casona,NOUN +menino,menina,NOUN +menino,menino,NOUN +menino,meninão,NOUN +menino,menininho,NOUN +carro,carro,NOUN \ No newline at end of file
