This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new df4a7ca0 OPENNLP-936: Add thread-safe versions of POSTaggerME, SentenceDetecto… (#69)
df4a7ca0 is described below

commit df4a7ca0109e6dd6966af60d0c18b694e883d19f
Author: Thilo Götz <[email protected]>
AuthorDate: Mon Oct 7 08:30:30 2024 +0200

    OPENNLP-936: Add thread-safe versions of POSTaggerME, SentenceDetecto… (#69)
    
    * OPENNLP-936: Add thread-safe versions of some tools.
    
    Thread safe versions of POSTaggerME, SentenceDetectorME and TokenizerME.
    Include test case as well.
    
    * Fix checkstyle and adjust code
    
    * Applies comments regarding class names
    
    ---------
    
    Co-authored-by: Thilo Goetz <[email protected]>
    Co-authored-by: Richard Zowalla <[email protected]>
---
 .../tools/postag/ThreadSafePOSTaggerME.java        | 67 ++++++++++++++++
 .../sentdetect/ThreadSafeSentenceDetectorME.java   | 67 ++++++++++++++++
 .../tools/tokenize/ThreadSafeTokenizerME.java      | 61 +++++++++++++++
 .../opennlp/tools/eval/MultiThreadedToolsEval.java | 90 ++++++++++++++++++++++
 4 files changed, 285 insertions(+)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java b/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java
new file mode 100644
index 00000000..52419ddf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.postag;
+
+import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.util.Sequence;
+
+/**
+ * A thread-safe version of the {@link POSTaggerME}. Using it is completely transparent. You can
+ * use it in a single-threaded context as well, it only incurs a minimal overhead.
+ * <p>
+ * Note, however, that this implementation uses a {@link ThreadLocal}: every thread that calls a
+ * method of this class gets its own {@link POSTaggerME}, created on first use from the
+ * {@link POSModel} passed to the constructor. If you have many long-running threads, you may run
+ * into memory issues. Be careful when you use this in a JEE application, for example, and call
+ * {@link #close()} from a thread that no longer needs its tagger.
+ */
+@ThreadSafe
+public class ThreadSafePOSTaggerME implements POSTagger, AutoCloseable {
+
+  private final POSModel model;
+
+  private final ThreadLocal<POSTaggerME> threadLocal = new ThreadLocal<>();
+
+  /**
+   * Creates a thread-safe tagger backed by the given model.
+   *
+   * @param model The {@link POSModel} handed to every per-thread {@link POSTaggerME}.
+   */
+  public ThreadSafePOSTaggerME(POSModel model) {
+    this.model = model;
+  }
+
+  // If the calling thread already has a tagger, return it. Otherwise, create, store, then return.
+  private POSTaggerME getTagger() {
+    POSTaggerME tagger = threadLocal.get();
+    if (tagger == null) {
+      tagger = new POSTaggerME(model);
+      threadLocal.set(tagger);
+    }
+    return tagger;
+  }
+
+  /**
+   * Delegates to {@link POSTaggerME#tag(String[])} of the calling thread's tagger.
+   */
+  @Override
+  public String[] tag(String[] sentence) {
+    return getTagger().tag(sentence);
+  }
+
+  /**
+   * Delegates to {@link POSTaggerME#tag(String[], Object[])} of the calling thread's tagger.
+   */
+  @Override
+  public String[] tag(String[] sentence, Object[] additionaContext) {
+    return getTagger().tag(sentence, additionaContext);
+  }
+
+  /**
+   * Delegates to {@link POSTaggerME#topKSequences(String[])} of the calling thread's tagger.
+   */
+  @Override
+  public Sequence[] topKSequences(String[] sentence) {
+    return getTagger().topKSequences(sentence);
+  }
+
+  /**
+   * Delegates to {@link POSTaggerME#topKSequences(String[], Object[])} of the calling thread's
+   * tagger.
+   */
+  @Override
+  public Sequence[] topKSequences(String[] sentence, Object[] additionaContext) {
+    return getTagger().topKSequences(sentence, additionaContext);
+  }
+
+  /**
+   * Discards the {@link POSTaggerME} bound to the calling thread, if there is one. Instances
+   * bound to other threads are not touched. This object stays usable afterwards: the next call
+   * made on this thread simply creates a fresh tagger.
+   */
+  @Override
+  public void close() {
+    threadLocal.remove();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
new file mode 100644
index 00000000..99abc6fb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.util.Span;
+
+/**
+ * A thread-safe version of {@link SentenceDetectorME}. Using it is completely transparent. You
+ * can use it in a single-threaded context as well, it only incurs a minimal overhead.
+ * <p>
+ * Note, however, that this implementation uses a {@link ThreadLocal}: every thread that calls a
+ * method of this class gets its own {@link SentenceDetectorME}, created on first use from the
+ * {@link SentenceModel} passed to the constructor. If you have many long-running threads, you
+ * may run into memory issues. Be careful when you use this in a JEE application, for example,
+ * and call {@link #close()} from a thread that no longer needs its detector.
+ */
+@ThreadSafe
+public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {
+
+  private final SentenceModel model;
+
+  private final ThreadLocal<SentenceDetectorME> sentenceDetectorThreadLocal =
+      new ThreadLocal<>();
+
+  /**
+   * Creates a thread-safe sentence detector backed by the given model.
+   *
+   * @param model The {@link SentenceModel} handed to every per-thread {@link SentenceDetectorME}.
+   */
+  public ThreadSafeSentenceDetectorME(SentenceModel model) {
+    this.model = model;
+  }
+
+  // If a thread-local version exists, return it. Otherwise, create, then return.
+  private SentenceDetectorME getSD() {
+    SentenceDetectorME sd = sentenceDetectorThreadLocal.get();
+    if (sd == null) {
+      sd = new SentenceDetectorME(model);
+      sentenceDetectorThreadLocal.set(sd);
+    }
+    return sd;
+  }
+
+  /**
+   * Delegates to {@link SentenceDetectorME#getSentenceProbabilities()} of the calling thread's
+   * detector. Because the detector is per-thread, the result never reflects calls made on other
+   * threads.
+   */
+  public double[] getSentenceProbabilities() {
+    return getSD().getSentenceProbabilities();
+  }
+
+  /**
+   * Delegates to {@link SentenceDetectorME#sentDetect(CharSequence)} of the calling thread's
+   * detector.
+   */
+  @Override
+  public String[] sentDetect(CharSequence s) {
+    return getSD().sentDetect(s);
+  }
+
+  /**
+   * Delegates to {@link SentenceDetectorME#sentPosDetect(CharSequence)} of the calling thread's
+   * detector.
+   */
+  @Override
+  public Span[] sentPosDetect(CharSequence s) {
+    return getSD().sentPosDetect(s);
+  }
+
+  /**
+   * Discards the {@link SentenceDetectorME} bound to the calling thread, if there is one.
+   * Instances bound to other threads are not touched. This object stays usable afterwards: the
+   * next call made on this thread simply creates a fresh detector.
+   */
+  @Override
+  public void close() {
+    sentenceDetectorThreadLocal.remove();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
new file mode 100644
index 00000000..b92dd5e0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.util.Span;
+
+/**
+ * A thread-safe version of {@link TokenizerME}. Using it is completely transparent. You can use
+ * it in a single-threaded context as well, it only incurs a minimal overhead.
+ * <p>
+ * Note, however, that this implementation uses a {@link ThreadLocal}: every thread that calls a
+ * method of this class gets its own {@link TokenizerME}, created on first use from the
+ * {@link TokenizerModel} passed to the constructor. If you have many long-running threads, you
+ * may run into memory issues. Be careful when you use this in a JEE application, for example,
+ * and call {@link #close()} from a thread that no longer needs its tokenizer.
+ */
+@ThreadSafe
+public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {
+
+  private final TokenizerModel model;
+
+  private final ThreadLocal<TokenizerME> tokenizerThreadLocal = new ThreadLocal<>();
+
+  /**
+   * Creates a thread-safe tokenizer backed by the given model.
+   *
+   * @param model The {@link TokenizerModel} handed to every per-thread {@link TokenizerME}.
+   */
+  public ThreadSafeTokenizerME(TokenizerModel model) {
+    this.model = model;
+  }
+
+  // If the calling thread already has a tokenizer, return it. Otherwise, create, store, return.
+  private TokenizerME getTokenizer() {
+    TokenizerME tokenizer = tokenizerThreadLocal.get();
+    if (tokenizer == null) {
+      tokenizer = new TokenizerME(model);
+      tokenizerThreadLocal.set(tokenizer);
+    }
+    return tokenizer;
+  }
+
+  /**
+   * Delegates to {@link TokenizerME#tokenize(String)} of the calling thread's tokenizer.
+   */
+  @Override
+  public String[] tokenize(String s) {
+    return getTokenizer().tokenize(s);
+  }
+
+  /**
+   * Delegates to {@link TokenizerME#tokenizePos(String)} of the calling thread's tokenizer.
+   */
+  @Override
+  public Span[] tokenizePos(String s) {
+    return getTokenizer().tokenizePos(s);
+  }
+
+  /**
+   * Delegates to {@link TokenizerME#getTokenProbabilities()} of the calling thread's tokenizer.
+   * Because the tokenizer is per-thread, the result never reflects calls made on other threads.
+   */
+  public double[] getProbabilities() {
+    return getTokenizer().getTokenProbabilities();
+  }
+
+  /**
+   * Discards the {@link TokenizerME} bound to the calling thread, if there is one. Instances
+   * bound to other threads are not touched. This object stays usable afterwards: the next call
+   * made on this thread simply creates a fresh tokenizer.
+   */
+  @Override
+  public void close() {
+    tokenizerThreadLocal.remove();
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java
new file mode 100644
index 00000000..fcb2bfa9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.ThreadSafePOSTaggerME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.sentdetect.ThreadSafeSentenceDetectorME;
+import opennlp.tools.tokenize.ThreadSafeTokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Test the reentrant tools implementations are really thread safe by running concurrently.
+ * Replace the thread-safe versions with the non-safe versions to see this test case fail.
+ * <p>
+ * A failure inside any worker thread is captured and rethrown on the test thread; without that,
+ * an exception would only terminate the worker and the test would still be reported as passed.
+ */
+public class MultiThreadedToolsEval extends AbstractEvalTest {
+
+  /**
+   * Runs sentence detection, tokenization and POS tagging on a fixed sample text from several
+   * threads at once, all sharing the same three thread-safe tool instances.
+   *
+   * @throws IOException Thrown if one of the model files cannot be loaded.
+   * @throws InterruptedException Thrown if the test thread is interrupted while joining workers.
+   */
+  @Test
+  public void runMEToolsMultiThreaded() throws IOException, InterruptedException {
+
+    File sModelFile = new File(getOpennlpDataDir(), "models-sf/en-sent.bin");
+    SentenceModel sModel = new SentenceModel(sModelFile);
+    ThreadSafeSentenceDetectorME sentencer = new ThreadSafeSentenceDetectorME(sModel);
+
+    File tModelFile = new File(getOpennlpDataDir(), "models-sf/en-token.bin");
+    TokenizerModel tModel = new TokenizerModel(tModelFile);
+    ThreadSafeTokenizerME tokenizer = new ThreadSafeTokenizerME(tModel);
+
+    File pModelFile = new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin");
+    POSModel pModel = new POSModel(pModelFile);
+    ThreadSafePOSTaggerME tagger = new ThreadSafePOSTaggerME(pModel);
+
+    final String text = "All human beings are born free and equal in dignity and rights. They " +
+        "are endowed with reason and conscience and should act towards one another in a " +
+        "spirit of brotherhood.";
+
+    // Run numThreads threads, each processing the sample text numRunsPerThread times.
+    final int numThreads = 8;
+    final int numRunsPerThread = 1000;
+    Thread[] threads = new Thread[numThreads];
+
+    // Holds the first Throwable raised by any worker. Fully qualified on purpose so that this
+    // test needs no additional import.
+    final java.util.concurrent.atomic.AtomicReference<Throwable> firstFailure =
+        new java.util.concurrent.atomic.AtomicReference<>();
+
+    for (int i = 0; i < numThreads; i++) {
+      threads[i] = new Thread(new Runnable() {
+        @Override
+        public void run() {
+          try {
+            for (int j = 0; j < numRunsPerThread; j++) {
+              Span[] sentences = sentencer.sentPosDetect(text);
+              for (Span span : sentences) {
+                String sentence = text.substring(span.getStart(), span.getEnd());
+                Span[] tokens = tokenizer.tokenizePos(sentence);
+                String[] tokenStrings = new String[tokens.length];
+                for (int k = 0; k < tokens.length; k++) {
+                  tokenStrings[k] = sentence.substring(tokens[k].getStart(),
+                      tokens[k].getEnd());
+                }
+                tagger.tag(tokenStrings);
+              }
+            }
+          } catch (Throwable t) {
+            // Keep only the first failure; it is reported once all workers have finished.
+            firstFailure.compareAndSet(null, t);
+          }
+        }
+      });
+      threads[i].start();
+    }
+    for (Thread t : threads) {
+      t.join();
+    }
+
+    Throwable failure = firstFailure.get();
+    if (failure != null) {
+      throw new AssertionError(
+          "A worker thread failed while running the tools concurrently", failure);
+    }
+  }
+
+}

Reply via email to