Repository: opennlp Updated Branches: refs/heads/LangDetect a189d4ecc -> eb6fb32d1
Add support to train on leipzig Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/eb6fb32d Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/eb6fb32d Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/eb6fb32d Branch: refs/heads/LangDetect Commit: eb6fb32d1dbb86f3417ae67a1f45daa0da39fa3a Parents: a189d4e Author: Jörn Kottmann <[email protected]> Authored: Thu May 18 17:03:52 2017 +0200 Committer: Jörn Kottmann <[email protected]> Committed: Fri May 19 11:02:44 2017 +0200 ---------------------------------------------------------------------- .../tools/cmdline/StreamFactoryRegistry.java | 3 + .../formats/LeipzigDoccatSampleStream.java | 5 +- .../LeipzigDocumentSampleStreamFactory.java | 3 + .../leipzig/LeipzigLanguageSampleStream.java | 133 +++++++++++++++++++ .../LeipzigLanguageSampleStreamFactory.java | 73 ++++++++++ .../LanguageDetectorContextGenerator.java | 21 ++- .../tools/langdetect/LanguageSample.java | 13 +- 7 files changed, 227 insertions(+), 24 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java index d1e8c89..b258ab2 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java +++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java @@ -53,6 +53,7 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory; import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory; import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory; import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory; +import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory; import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory; import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory; import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory; @@ -117,6 +118,8 @@ public final class StreamFactoryRegistry { ConlluPOSSampleStreamFactory.registerFactory(); ConlluLemmaSampleStreamFactory.registerFactory(); + + LeipzigLanguageSampleStreamFactory.registerFactory(); } public static final String DEFAULT_FORMAT = "opennlp"; http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java index 8ed0036..7059e21 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java @@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream; * <p> * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce - * exactly the same tokenization during testing and training.ø + * exactly the same tokenization during testing and training. + * + * @deprecated will be removed, use the language detector instead */ +@Deprecated public class LeipzigDoccatSampleStream extends FilterObjectStream<String, DocumentSample> { http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java index bd2453b..d6ff9ba 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java @@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils; /** * <b>Note:</b> Do not use this class, internal use only! + * + * @deprecated will be removed, use the language detector instead */ +@Deprecated public class LeipzigDocumentSampleStreamFactory extends AbstractSampleStreamFactory<DocumentSample> { http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java new file mode 100644 index 0000000..582fb08 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.leipzig; + +import java.io.File; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.util.Arrays; +import java.util.Iterator; +import java.util.Map; +import java.util.stream.Collectors; + +import opennlp.tools.langdetect.Language; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.MarkableFileInputStreamFactory; +import opennlp.tools.util.ObjectStream; +import opennlp.tools.util.PlainTextByLineStream; + +public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> { + + private class LeipzigSentencesStream implements ObjectStream<LanguageSample> { + private final String lang; + private int sentencesPerSample; + private int numberOfSamples; + + private ObjectStream<String> lineStream; + private int sampleCount; + + LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples) + throws IOException { + this.lang = sentencesFile.getName().substring(0, 3); + this.sentencesPerSample = sentencesPerSample; + this.numberOfSamples = numberOfSamples; + + lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile), + StandardCharsets.UTF_8); + } + + @Override + public LanguageSample read() throws IOException { + if (sampleCount < numberOfSamples) { + StringBuilder sampleString = new StringBuilder(); + + int count = 0; + String line; + while (count < sentencesPerSample && (line = lineStream.read()) != null) { + + // TODO: It should it be changed to contain an array of sample strings ?! + sampleString.append(line + " "); + + count++; + } + + if (sampleString.length() > 0) { + sampleCount++; + return new LanguageSample(new Language(lang), sampleString); + } + } + return null; + } + } + + private final int sentencesPerSample; + + private Map<String, Integer> langSampleCounts; + private File[] sentencesFiles; + + private Iterator<File> sentencesFilesIt; + private ObjectStream<LanguageSample> sampleStream; + + public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample, + final int samplesPerLanguage) throws IOException { + this.sentencesPerSample = sentencesPerSample; + // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored + sentencesFiles = leipzigFolder.listFiles(); + Arrays.sort(sentencesFiles); + + Map<String, Integer> langCounts = Arrays.stream(sentencesFiles) + .map(file -> file.getName().substring(0, 3)) + .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1))); + + langSampleCounts = langCounts.entrySet().stream() + .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue())); + + reset(); + } + + public LanguageSample read() throws IOException { + LanguageSample sample; + if (sampleStream != null && (sample = sampleStream.read()) != null) { + return sample; + } + else { + if (sentencesFilesIt.hasNext()) { + File sentencesFile = sentencesFilesIt.next(); + System.out.println(sentencesFile); + String lang = sentencesFile.getName().substring(0, 3); + + sampleStream = new LeipzigSentencesStream(lang, sentencesFile, + sentencesPerSample, langSampleCounts.get(lang)); + + return read(); + } + } + return null; + } + + @Override + public void reset() throws IOException { + sentencesFilesIt = Arrays.asList(sentencesFiles).iterator(); + sampleStream = null; + } + + public static void main(String[] args) throws Exception { + new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"), + 10, 100000); + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java new file mode 100644 index 0000000..96b0378 --- /dev/null +++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java @@ -0,0 +1,73 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.formats.leipzig; + +import java.io.File; +import java.io.IOException; + +import opennlp.tools.cmdline.ArgumentParser; +import opennlp.tools.cmdline.ArgumentParser.ParameterDescription; +import opennlp.tools.cmdline.StreamFactoryRegistry; +import opennlp.tools.cmdline.TerminateToolException; +import opennlp.tools.cmdline.params.EncodingParameter; +import opennlp.tools.formats.AbstractSampleStreamFactory; +import opennlp.tools.langdetect.LanguageSample; +import opennlp.tools.util.ObjectStream; + +/** + * <b>Note:</b> Do not use this class, internal use only! + */ +public class LeipzigLanguageSampleStreamFactory + extends AbstractSampleStreamFactory<LanguageSample> { + + interface Parameters extends EncodingParameter { + @ParameterDescription(valueName = "sentencesDir", + description = "dir with Leipig sentences to be used") + File getSentencesDir(); + + @ParameterDescription(valueName = "sentencesPerSample", + description = "number of sentences per sample") + int getSentencesPerSample(); + + @ParameterDescription(valueName = "samplesPerLanguage", + description = "number of samples per language") + int getSamplesPerLanguage(); + } + + protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) { + super(params); + } + + public static void registerFactory() { + StreamFactoryRegistry.registerFactory(LanguageSample.class, + "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class)); + } + + public ObjectStream<LanguageSample> create(String[] args) { + + Parameters params = ArgumentParser.parse(args, Parameters.class); + File sentencesFileDir = params.getSentencesDir(); + + try { + return new LeipzigLanguageSampleStream(sentencesFileDir, params.getSentencesPerSample(), + params.getSamplesPerLanguage()); + } catch (IOException e) { + throw new TerminateToolException(-1, "IO error while opening sample data.", e); + } + } +} http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java index c63ba76..dcfe0e9 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java @@ -25,12 +25,6 @@ import opennlp.tools.util.StringList; import opennlp.tools.util.StringUtil; import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer; import opennlp.tools.util.normalizer.CharSequenceNormalizer; -import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer; -import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer; -import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer; -import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer; -import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer; -import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer; /** * Context generator for document categorizer @@ -46,19 +40,20 @@ class LanguageDetectorContextGenerator { this.maxLength = maxLength; this.normalizer = new AggregateCharSequenceNormalizer( - EmojiCharSequenceNormalizer.getInstance(), - UrlCharSequenceNormalizer.getInstance(), - TwitterCharSequenceNormalizer.getInstance(), - NumberCharSequenceNormalizer.getInstance(), - UnicodeCharSequenceNormalizer.getInstance(), - ShrinkCharSequenceNormalizer.getInstance()); + // EmojiCharSequenceNormalizer.getInstance(), + //UrlCharSequenceNormalizer.getInstance(), + //TwitterCharSequenceNormalizer.getInstance(), + //NumberCharSequenceNormalizer.getInstance(), + //UnicodeCharSequenceNormalizer.getInstance(), + //ShrinkCharSequenceNormalizer.getInstance()); + ); } /** * Initializes the current instance with min 2 length and max 5 length of ngrams. */ LanguageDetectorContextGenerator() { - this(2, 5); + this(3, 3); } public String[] getContext(String document) { http://git-wip-us.apache.org/repos/asf/opennlp/blob/eb6fb32d/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java ---------------------------------------------------------------------- diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java index 6f2fda7..f454864 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java +++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java @@ -28,10 +28,8 @@ public class LanguageSample { private final CharSequence context; public LanguageSample(Language language, CharSequence context) { - Objects.requireNonNull(context, "context must not be null"); - Objects.requireNonNull(language, "language must not be null"); - this.language = language; - this.context = context; + this.language = Objects.requireNonNull(language, "language must not be null"); + this.context = Objects.requireNonNull(context, "context must not be null"); } public Language getLanguage() { @@ -44,12 +42,7 @@ public class LanguageSample { @Override public String toString() { - - StringBuilder sampleString = new StringBuilder(); - - sampleString.append(language.getLang()).append('\t').append(context); - - return sampleString.toString(); + return language.getLang() + '\t' + context; } @Override
