(opennlp) 01/01: OPENNLP-1655 Add constructors in SentenceDetectorME and TokenizerME to customize Abbreviation Dict at runtime

mawiesne Sun, 24 Nov 2024 11:35:51 -0800

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch OPENNLP-1655-Add-constructors-to-customize-Abbreviation-Dict-at-runtime
in repository https://gitbox.apache.org/repos/asf/opennlp.git


commit 58cdb4248c1d5004cf82c23d1d83847ffef052d0
Author: Martin Wiesner <[email protected]>
AuthorDate: Sun Nov 24 20:35:33 2024 +0100

    OPENNLP-1655 Add constructors in SentenceDetectorME and TokenizerME to customize Abbreviation Dict at runtime
---
 .../opennlp/tools/sentdetect/SentenceDetectorME.java   | 14 ++++++++++++--
 .../tools/sentdetect/ThreadSafeSentenceDetectorME.java | 16 ++++++++++++++--
 .../opennlp/tools/tokenize/ThreadSafeTokenizerME.java  | 18 +++++++++++++++---
 .../main/java/opennlp/tools/tokenize/TokenizerME.java  | 16 +++++++++++++---
 4 files changed, 54 insertions(+), 10 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
index e65eed33..96953506 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceDetectorME.java
@@ -99,11 +99,21 @@ public class SentenceDetectorME implements SentenceDetector {
    * @param model the {@link SentenceModel}
    */
   public SentenceDetectorME(SentenceModel model) {
-    SentenceDetectorFactory sdFactory = model.getFactory();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link SentenceDetectorME} with an existing {@link SentenceModel}.
+   *
+   * @param model The {@link SentenceModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public SentenceDetectorME(SentenceModel model, Dictionary abbDict) {
     this.model = model.getMaxentModel();
+    this.abbDict = abbDict;
+    SentenceDetectorFactory sdFactory = model.getFactory();
     cgen = sdFactory.getSDContextGenerator();
     scanner = sdFactory.getEndOfSentenceScanner();
-    abbDict = model.getAbbreviations();
     useTokenEnd = sdFactory.isUseTokenEnd();
   }
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
index 17ea14e8..7706cfa8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
@@ -18,6 +18,7 @@
 package opennlp.tools.sentdetect;
 
 import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.Span;
 
 /**
@@ -37,20 +38,31 @@ import opennlp.tools.util.Span;
 public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {
 
   private final SentenceModel model;
+  private final Dictionary abbDict;
 
   private final ThreadLocal<SentenceDetectorME> threadLocal =
       new ThreadLocal<>();
 
   public ThreadSafeSentenceDetectorME(SentenceModel model) {
-    super();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link ThreadSafeSentenceDetectorME} with an existing {@link SentenceModel}.
+   *
+   * @param model The {@link SentenceModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public ThreadSafeSentenceDetectorME(SentenceModel model, Dictionary abbDict) {
     this.model = model;
+    this.abbDict = abbDict;
   }
 
   // If a thread-local version exists, return it. Otherwise, create, then return.
   private SentenceDetectorME getSD() {
     SentenceDetectorME sd = threadLocal.get();
     if (sd == null) {
-      sd = new SentenceDetectorME(model);
+      sd = new SentenceDetectorME(model, abbDict);
       threadLocal.set(sd);
     }
     return sd;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
index 3ebbd1e3..13de7bc5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
@@ -18,6 +18,7 @@
 package opennlp.tools.tokenize;
 
 import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.util.Span;
 
 /**
@@ -37,18 +38,29 @@ import opennlp.tools.util.Span;
 public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {
 
   private final TokenizerModel model;
-
+  private final Dictionary abbDict;
+  
   private final ThreadLocal<TokenizerME> threadLocal = new ThreadLocal<>();
 
   public ThreadSafeTokenizerME(TokenizerModel model) {
-    super();
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link ThreadSafeTokenizerME} with an existing {@link TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public ThreadSafeTokenizerME(TokenizerModel model, Dictionary abbDict) {
     this.model = model;
+    this.abbDict = abbDict;
   }
 
   private TokenizerME getTokenizer() {
     TokenizerME tokenizer = threadLocal.get();
     if (tokenizer == null) {
-      tokenizer = new TokenizerME(model);
+      tokenizer = new TokenizerME(model, abbDict);
       threadLocal.set(tokenizer);
     }
     return tokenizer;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 1c88f84b..ee0d8267 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -129,13 +129,23 @@ public class TokenizerME extends AbstractTokenizer {
    * @param model The {@link TokenizerModel} to be used.
    */
   public TokenizerME(TokenizerModel model) {
+    this(model, model.getAbbreviations());
+  }
+
+  /**
+   * Instantiates a {@link TokenizerME} with an existing {@link TokenizerModel}.
+   *
+   * @param model The {@link TokenizerModel} to be used.
+   * @param abbDict The {@link Dictionary} to be used. It must fit the language of the {@code model}.
+   */
+  public TokenizerME(TokenizerModel model, Dictionary abbDict) {
+    this.model = model.getMaxentModel();
+    this.abbDict = abbDict;
     TokenizerFactory factory = model.getFactory();
-    this.alphanumeric = factory.getAlphaNumericPattern();
     this.cg = factory.getContextGenerator();
-    this.model = model.getMaxentModel();
+    this.alphanumeric = factory.getAlphaNumericPattern();
     this.useAlphaNumericOptimization = factory.isUseAlphaNumericOptimization();
 
-    abbDict = model.getAbbreviations();
     newTokens = new ArrayList<>();
     tokProbs = new ArrayList<>(50);
   }

(opennlp) 01/01: OPENNLP-1655 Add constructors in SentenceDetectorME and TokenizerME to customize Abbreviation Dict at runtime

Reply via email to