This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1655-Add-constructors-to-customize-Abbreviation-Dict-at-runtime in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 58cdb4248c1d5004cf82c23d1d83847ffef052d0 Author: Martin Wiesner <[email protected]> AuthorDate: Sun Nov 24 20:35:33 2024 +0100 OPENNLP-1655 Add constructors in SentenceDetectorME and TokenizerME to customize Abbreviation Dict at runtime --- .../opennlp/tools/sentdetect/SentenceDetectorME.java | 14 ++++++++++++-- .../tools/sentdetect/ThreadSafeSentenceDetectorME.java | 16 ++++++++++++++-- .../opennlp/tools/tokenize/ThreadSafeTokenizerME.java | 18 +++++++++++++++--- .../main/java/opennlp/tools/tokenize/TokenizerME.java | 16 +++++++++++++--- 4 files changed, 54 insertions(+), 10 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java index e65eed33..96953506 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java @@ -99,11 +99,21 @@ public class SentenceDetectorME implements SentenceDetector { * @param model the {@link SentenceModel} */ public SentenceDetectorME(SentenceModel model) { - SentenceDetectorFactory sdFactory = model.getFactory(); + this(model, model.getAbbreviations()); + } + + /** + * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}. + * + * @param model The {@link SentenceModel} to be used. + * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}. + */ + public SentenceDetectorME(SentenceModel model, Dictionary abbDict) { this.model = model.getMaxentModel(); + this.abbDict = abbDict; + SentenceDetectorFactory sdFactory = model.getFactory(); cgen = sdFactory.getSDContextGenerator(); scanner = sdFactory.getEndOfSentenceScanner(); - abbDict = model.getAbbreviations(); useTokenEnd = sdFactory.isUseTokenEnd(); } diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java index 17ea14e8..7706cfa8 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java @@ -18,6 +18,7 @@ package opennlp.tools.sentdetect; import opennlp.tools.commons.ThreadSafe; +import opennlp.tools.dictionary.Dictionary; import opennlp.tools.util.Span; /** @@ -37,20 +38,31 @@ import opennlp.tools.util.Span; public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable { private final SentenceModel model; + private final Dictionary abbDict; private final ThreadLocal<SentenceDetectorME> threadLocal = new ThreadLocal<>(); public ThreadSafeSentenceDetectorME(SentenceModel model) { - super(); + this(model, model.getAbbreviations()); + } + + /** + * Instantiates a {@link ThreadSafeSentenceDetectorME} with an existing {@link SentenceModel}. + * + * @param model The {@link SentenceModel} to be used. + * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}. + */ + public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) { this.model = model; + this.abbDict = abbDict; } // If a thread-local version exists, return it. Otherwise, create, then return. private SentenceDetectorME getSD() { SentenceDetectorME sd = threadLocal.get(); if (sd == null) { - sd = new SentenceDetectorME(model); + sd = new SentenceDetectorME(model, abbDict); threadLocal.set(sd); } return sd; diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java index 3ebbd1e3..13de7bc5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java @@ -18,6 +18,7 @@ package opennlp.tools.tokenize; import opennlp.tools.commons.ThreadSafe; +import opennlp.tools.dictionary.Dictionary; import opennlp.tools.util.Span; /** @@ -37,18 +38,29 @@ import opennlp.tools.util.Span; public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable { private final TokenizerModel model; - + private final Dictionary abbDict; + private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>(); public ThreadSafeTokenizerME(TokenizerModel model) { - super(); + this(model, model.getAbbreviations()); + } + + /** + * Instantiates a {@link ThreadSafeTokenizerME} with an existing {@link TokenizerModel}. + * + * @param model The {@link TokenizerModel} to be used. + * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}. + */ + public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) { this.model = model; + this.abbDict = abbDict; } private TokenizerME getTokenizer() { TokenizerME tokenizer = threadLocal.get(); if (tokenizer == null) { - tokenizer = new TokenizerME(model); + tokenizer = new TokenizerME(model, abbDict); threadLocal.set(tokenizer); } return tokenizer; diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java index 1c88f84b..ee0d8267 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java +++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java @@ -129,13 +129,23 @@ public class TokenizerME extends AbstractTokenizer { * @param model The {@link TokenizerModel} to be used. */ public TokenizerME(TokenizerModel model) { + this(model, model.getAbbreviations()); + } + + /** + * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}. + * + * @param model The {@link TokenizerModel} to be used. + * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}. + */ + public TokenizerME(TokenizerModel model, Dictionary abbDict) { + this.model = model.getMaxentModel(); + this.abbDict = abbDict; TokenizerFactory factory = model.getFactory(); - this.alphanumeric = factory.getAlphaNumericPattern(); this.cg = factory.getContextGenerator(); - this.model = model.getMaxentModel(); + this.alphanumeric = factory.getAlphaNumericPattern(); this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization(); - abbDict = model.getAbbreviations(); newTokens = new ArrayList<>(); tokProbs = new ArrayList<>(50); }
