This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new df4a7ca0 OPENNLP-936: Add thread-safe versions of POSTaggerME,
SentenceDetecto… (#69)
df4a7ca0 is described below
commit df4a7ca0109e6dd6966af60d0c18b694e883d19f
Author: Thilo Götz <[email protected]>
AuthorDate: Mon Oct 7 08:30:30 2024 +0200
OPENNLP-936: Add thread-safe versions of POSTaggerME, SentenceDetecto… (#69)
* OPENNLP-936: Add thread-safe versions of some tools.
Thread safe versions of POSTaggerME, SentenceDetectorME and TokenizerME.
Include test case as well.
* Fix checkstyle and adjust code
* Applies comments regarding class names
---------
Co-authored-by: Thilo Goetz <[email protected]>
Co-authored-by: Richard Zowalla <[email protected]>
---
.../tools/postag/ThreadSafePOSTaggerME.java | 67 ++++++++++++++++
.../sentdetect/ThreadSafeSentenceDetectorME.java | 67 ++++++++++++++++
.../tools/tokenize/ThreadSafeTokenizerME.java | 61 +++++++++++++++
.../opennlp/tools/eval/MultiThreadedToolsEval.java | 90 ++++++++++++++++++++++
4 files changed, 285 insertions(+)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java b/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java
new file mode 100644
index 00000000..52419ddf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/ThreadSafePOSTaggerME.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.postag;
+
+import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.util.Sequence;
+
+/**
+ * A thread-safe {@link POSTagger} that forwards every call to a {@link POSTaggerME}
+ * owned by the calling thread. It can be used exactly like a plain tagger, also from a
+ * single thread; the extra work per call is one {@link ThreadLocal} lookup.
+ * <p>
+ * Note that this implementation keeps one {@link POSTaggerME} per calling thread in a
+ * {@link ThreadLocal}. Every per-thread tagger is created from the same {@link POSModel}
+ * instance. With many long-running or pooled threads (for example in a JEE application)
+ * those per-thread taggers stay reachable until the thread ends, so call {@link #close()}
+ * on each thread once it is done with this tagger.
+ */
+@ThreadSafe
+public class ThreadSafePOSTaggerME implements POSTagger, AutoCloseable {
+
+  // Holds the tagger of the current thread; it is created lazily on first access.
+  private final ThreadLocal<POSTaggerME> threadLocal;
+
+  /**
+   * Creates a thread-safe tagger. No {@link POSTaggerME} is instantiated here; each thread
+   * gets its own instance the first time it calls one of the tagging methods.
+   *
+   * @param model The {@link POSModel} handed to every per-thread {@link POSTaggerME}.
+   */
+  public ThreadSafePOSTaggerME(POSModel model) {
+    super();
+    this.threadLocal = ThreadLocal.withInitial(() -> new POSTaggerME(model));
+  }
+
+  /** Forwards to {@link POSTaggerME#tag(String[])} of the calling thread's tagger. */
+  @Override
+  public String[] tag(String[] sentence) {
+    return threadLocal.get().tag(sentence);
+  }
+
+  /** Forwards to {@link POSTaggerME#tag(String[], Object[])} of the calling thread's tagger. */
+  @Override
+  public String[] tag(String[] sentence, Object[] additionaContext) {
+    return threadLocal.get().tag(sentence, additionaContext);
+  }
+
+  /** Forwards to {@link POSTaggerME#topKSequences(String[])} of the calling thread's tagger. */
+  @Override
+  public Sequence[] topKSequences(String[] sentence) {
+    return threadLocal.get().topKSequences(sentence);
+  }
+
+  /**
+   * Forwards to {@link POSTaggerME#topKSequences(String[], Object[])} of the calling
+   * thread's tagger.
+   */
+  @Override
+  public Sequence[] topKSequences(String[] sentence, Object[] additionaContext) {
+    return threadLocal.get().topKSequences(sentence, additionaContext);
+  }
+
+  /**
+   * Drops the {@link POSTaggerME} bound to the <em>calling</em> thread so it can be garbage
+   * collected. Instances bound to other threads are not touched; each thread has to call
+   * this method itself. This object stays usable afterwards: the next tagging call on this
+   * thread simply creates a fresh per-thread tagger.
+   */
+  @Override
+  public void close() {
+    threadLocal.remove();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
new file mode 100644
index 00000000..99abc6fb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/ThreadSafeSentenceDetectorME.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.util.Span;
+
+/**
+ * A thread-safe {@link SentenceDetector} that forwards every call to a
+ * {@link SentenceDetectorME} owned by the calling thread. It can be used exactly like a
+ * plain detector, also from a single thread; the extra work per call is one
+ * {@link ThreadLocal} lookup.
+ * <p>
+ * Note, however, that this implementation uses a {@link ThreadLocal}. Every per-thread
+ * detector is created from the same {@link SentenceModel} instance, but if you have many
+ * long-running threads you may still run into memory issues because the per-thread
+ * detectors stay reachable until their thread ends. Be careful when you use this in a JEE
+ * application, for example, and call {@link #close()} on each thread once it is done with
+ * this detector.
+ */
+@ThreadSafe
+public class ThreadSafeSentenceDetectorME implements SentenceDetector, AutoCloseable {
+
+  // Holds the detector of the current thread; it is created lazily on first access.
+  private final ThreadLocal<SentenceDetectorME> sentenceDetectorThreadLocal;
+
+  /**
+   * Creates a thread-safe sentence detector. No {@link SentenceDetectorME} is instantiated
+   * here; each thread gets its own instance the first time it calls one of the methods.
+   *
+   * @param model The {@link SentenceModel} handed to every per-thread
+   *              {@link SentenceDetectorME}.
+   */
+  public ThreadSafeSentenceDetectorME(SentenceModel model) {
+    super();
+    this.sentenceDetectorThreadLocal =
+        ThreadLocal.withInitial(() -> new SentenceDetectorME(model));
+  }
+
+  /**
+   * Forwards to {@link SentenceDetectorME#getSentenceProbabilities()} of the calling
+   * thread's detector, so the values relate to detection calls made on this same thread.
+   *
+   * @return The probabilities reported by the calling thread's detector.
+   */
+  public double[] getSentenceProbabilities() {
+    return sentenceDetectorThreadLocal.get().getSentenceProbabilities();
+  }
+
+  /** Forwards to {@link SentenceDetectorME#sentDetect(CharSequence)} of the calling thread. */
+  @Override
+  public String[] sentDetect(CharSequence s) {
+    return sentenceDetectorThreadLocal.get().sentDetect(s);
+  }
+
+  /** Forwards to {@link SentenceDetectorME#sentPosDetect(CharSequence)} of the calling thread. */
+  @Override
+  public Span[] sentPosDetect(CharSequence s) {
+    return sentenceDetectorThreadLocal.get().sentPosDetect(s);
+  }
+
+  /**
+   * Drops the {@link SentenceDetectorME} bound to the <em>calling</em> thread so it can be
+   * garbage collected. Instances bound to other threads are not touched; each thread has to
+   * call this method itself. This object stays usable afterwards: the next call on this
+   * thread simply creates a fresh per-thread detector.
+   */
+  @Override
+  public void close() {
+    sentenceDetectorThreadLocal.remove();
+  }
+}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
new file mode 100644
index 00000000..b92dd5e0
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/ThreadSafeTokenizerME.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.tokenize;
+
+import opennlp.tools.commons.ThreadSafe;
+import opennlp.tools.util.Span;
+
+/**
+ * A thread-safe {@link Tokenizer} that forwards every call to a {@link TokenizerME} owned
+ * by the calling thread. It can be used exactly like a plain tokenizer, also from a single
+ * thread; the extra work per call is one {@link ThreadLocal} lookup.
+ * <p>
+ * Note that this implementation keeps one {@link TokenizerME} per calling thread in a
+ * {@link ThreadLocal}. Every per-thread tokenizer is created from the same
+ * {@link TokenizerModel} instance. With many long-running or pooled threads (for example in
+ * a JEE application) those per-thread tokenizers stay reachable until the thread ends, so
+ * call {@link #close()} on each thread once it is done with this tokenizer.
+ */
+@ThreadSafe
+public class ThreadSafeTokenizerME implements Tokenizer, AutoCloseable {
+
+  // Holds the tokenizer of the current thread; it is created lazily on first access.
+  private final ThreadLocal<TokenizerME> tokenizerThreadLocal;
+
+  /**
+   * Creates a thread-safe tokenizer. No {@link TokenizerME} is instantiated here; each
+   * thread gets its own instance the first time it calls one of the methods.
+   *
+   * @param model The {@link TokenizerModel} handed to every per-thread {@link TokenizerME}.
+   */
+  public ThreadSafeTokenizerME(TokenizerModel model) {
+    super();
+    this.tokenizerThreadLocal = ThreadLocal.withInitial(() -> new TokenizerME(model));
+  }
+
+  /** Forwards to {@link TokenizerME#tokenize(String)} of the calling thread's tokenizer. */
+  @Override
+  public String[] tokenize(String s) {
+    return tokenizerThreadLocal.get().tokenize(s);
+  }
+
+  /** Forwards to {@link TokenizerME#tokenizePos(String)} of the calling thread's tokenizer. */
+  @Override
+  public Span[] tokenizePos(String s) {
+    return tokenizerThreadLocal.get().tokenizePos(s);
+  }
+
+  /**
+   * Forwards to {@link TokenizerME#getTokenProbabilities()} of the calling thread's
+   * tokenizer, so the values relate to tokenization calls made on this same thread.
+   *
+   * @return The probabilities reported by the calling thread's tokenizer.
+   */
+  public double[] getProbabilities() {
+    return tokenizerThreadLocal.get().getTokenProbabilities();
+  }
+
+  /**
+   * Drops the {@link TokenizerME} bound to the <em>calling</em> thread so it can be garbage
+   * collected. Instances bound to other threads are not touched; each thread has to call
+   * this method itself. This object stays usable afterwards: the next call on this thread
+   * simply creates a fresh per-thread tokenizer.
+   */
+  @Override
+  public void close() {
+    tokenizerThreadLocal.remove();
+  }
+}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java
new file mode 100644
index 00000000..fcb2bfa9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/MultiThreadedToolsEval.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.IOException;
+
+import org.junit.jupiter.api.Test;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.ThreadSafePOSTaggerME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.sentdetect.ThreadSafeSentenceDetectorME;
+import opennlp.tools.tokenize.ThreadSafeTokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.util.Span;
+
+/**
+ * Test the reentrant tools implementations are really thread safe by running concurrently.
+ * Replace the thread-safe versions with the non-safe versions to see this test case fail.
+ * <p>
+ * Anything thrown inside a worker thread is recorded and re-thrown on the test thread after
+ * all workers have finished; otherwise a failing worker would just terminate and the test
+ * would still be reported as passed.
+ */
+public class MultiThreadedToolsEval extends AbstractEvalTest {
+
+  /**
+   * Shares one sentence detector, one tokenizer and one POS tagger between several threads
+   * and lets every thread run the full sentence / token / tag pipeline repeatedly on the
+   * same sample text.
+   *
+   * @throws IOException Thrown if one of the model files could not be loaded.
+   * @throws InterruptedException Thrown if the test thread is interrupted while waiting
+   *                              for the workers.
+   */
+  @Test
+  public void runMEToolsMultiThreaded() throws IOException, InterruptedException {
+
+    File sModelFile = new File(getOpennlpDataDir(), "models-sf/en-sent.bin");
+    SentenceModel sModel = new SentenceModel(sModelFile);
+    ThreadSafeSentenceDetectorME sentencer = new ThreadSafeSentenceDetectorME(sModel);
+
+    File tModelFile = new File(getOpennlpDataDir(), "models-sf/en-token.bin");
+    TokenizerModel tModel = new TokenizerModel(tModelFile);
+    ThreadSafeTokenizerME tokenizer = new ThreadSafeTokenizerME(tModel);
+
+    File pModelFile = new File(getOpennlpDataDir(), "models-sf/en-pos-maxent.bin");
+    POSModel pModel = new POSModel(pModelFile);
+    ThreadSafePOSTaggerME tagger = new ThreadSafePOSTaggerME(pModel);
+
+    final String text = "All human beings are born free and equal in dignity and rights. They " +
+        "are endowed with reason and conscience and should act towards one another in a " +
+        "spirit of brotherhood.";
+
+    // Run numThreads threads, each processing the sample text numRunsPerThread times.
+    final int numThreads = 8;
+    final int numRunsPerThread = 1000;
+    Thread[] threads = new Thread[numThreads];
+
+    // One slot per worker: each worker writes only its own slot, and join() below makes
+    // those writes visible to the test thread, so no further synchronization is needed.
+    final Throwable[] failures = new Throwable[numThreads];
+
+    for (int i = 0; i < numThreads; i++) {
+      final int slot = i;
+      threads[i] = new Thread(() -> {
+        try {
+          for (int j = 0; j < numRunsPerThread; j++) {
+            Span[] sentences = sentencer.sentPosDetect(text);
+            for (Span span : sentences) {
+              String sentence = text.substring(span.getStart(), span.getEnd());
+              Span[] tokens = tokenizer.tokenizePos(sentence);
+              String[] tokenStrings = new String[tokens.length];
+              for (int k = 0; k < tokens.length; k++) {
+                tokenStrings[k] = sentence.substring(tokens[k].getStart(), tokens[k].getEnd());
+              }
+              String[] tags = tagger.tag(tokenStrings);
+              // The tagger is expected to return exactly one tag per input token.
+              if (tags.length != tokenStrings.length) {
+                throw new IllegalStateException("Expected " + tokenStrings.length
+                    + " tags but got " + tags.length);
+              }
+            }
+          }
+        } catch (Throwable t) {
+          // Deliberately broad: whatever kills this worker must fail the test.
+          failures[slot] = t;
+        }
+      });
+      threads[i].start();
+    }
+    for (Thread t : threads) {
+      t.join();
+    }
+
+    // Surface the first worker failure on the test thread, keeping the original cause.
+    for (int i = 0; i < numThreads; i++) {
+      if (failures[i] != null) {
+        throw new AssertionError("Worker thread " + i + " failed", failures[i]);
+      }
+    }
+
+  }
+
+}