This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/main by this push:
new 781d8ec OPENNLP-1729: Switch to easier loading of Models (#279)
781d8ec is described below
commit 781d8ecb70473ee0eb891fa2cd9ad0cacd624070
Author: Martin Wiesner <[email protected]>
AuthorDate: Wed May 14 08:27:02 2025 +0200
OPENNLP-1729: Switch to easier loading of Models (#279)
---
corpus-server/corpus-server-impl/pom.xml | 6 ----
.../opennlp/tools/coref/AbstractCorefTest.java | 8 ++---
.../Muc6FullParseCorefSampleStreamFactoryTest.java | 3 +-
opennlp-similarity/pom.xml | 35 ++++++++++++++++++----
.../ParserChunker2MatcherProcessor.java | 31 +++++++++----------
.../ParserPure2MatcherProcessor.java | 29 ++++++------------
opennlp-wsd/pom.xml | 25 ++++++++++++++++
.../opennlp/tools/disambiguator/WSDHelper.java | 24 ++++++++++-----
pom.xml | 8 ++++-
summarizer/pom.xml | 19 ++++++++++++
.../main/java/opennlp/summarization/Sentence.java | 6 ++--
.../lexicalchaining/NounPOSTagger.java | 19 ++++++++----
.../WordRelationshipDetermination.java | 4 +--
.../preprocess/DefaultDocProcessor.java | 14 +++++----
14 files changed, 154 insertions(+), 77 deletions(-)
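For readers skimming this change: the commit replaces the previous per-call DownloadUtil.downloadModel(...) lookups with a classpath-based ClassPathModelProvider, backed by the new opennlp-models-*-en runtime dependencies added to the affected poms. Below is a minimal, illustrative sketch of that loading pattern, based only on the calls visible in the diff; the class name ModelLoadingExample is hypothetical and not part of this commit.

import java.io.IOException;

import opennlp.tools.models.ClassPathModelProvider;
import opennlp.tools.models.DefaultClassPathModelProvider;
import opennlp.tools.models.ModelType;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.sentdetect.ThreadSafeSentenceDetectorME;
import opennlp.tools.tokenize.ThreadSafeTokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class ModelLoadingExample {

  // Resolves models from the opennlp-models-*-en artifacts on the classpath,
  // instead of downloading them at runtime via DownloadUtil.
  private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider();

  public static void main(String[] args) throws IOException {
    // Load the English sentence detector model from the classpath
    SentenceModel sm = MODEL_PROVIDER.load("en", ModelType.SENTENCE_DETECTOR, SentenceModel.class);
    ThreadSafeSentenceDetectorME sentenceDetector = new ThreadSafeSentenceDetectorME(sm);

    // Same pattern for the tokenizer model
    TokenizerModel tm = MODEL_PROVIDER.load("en", ModelType.TOKENIZER, TokenizerModel.class);
    ThreadSafeTokenizerME tokenizer = new ThreadSafeTokenizerME(tm);

    for (String sentence : sentenceDetector.sentDetect("OpenNLP now loads models from the classpath. No download step is needed.")) {
      System.out.println(String.join(" | ", tokenizer.tokenize(sentence)));
    }
  }
}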
diff --git a/corpus-server/corpus-server-impl/pom.xml b/corpus-server/corpus-server-impl/pom.xml
index 8d47c7f..947d77d 100644
--- a/corpus-server/corpus-server-impl/pom.xml
+++ b/corpus-server/corpus-server-impl/pom.xml
@@ -72,12 +72,6 @@
<version>${uimaj.version}</version>
</dependency>
- <dependency>
- <groupId>org.apache.derby</groupId>
- <artifactId>derby</artifactId>
- <version>${derby.version}</version>
- </dependency>
-
<dependency>
<groupId>org.apache.uima</groupId>
<artifactId>Lucas</artifactId>
diff --git a/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java b/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java
index 11869c8..bebcfdb 100644
--- a/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java
+++ b/opennlp-coref/src/test/java/opennlp/tools/coref/AbstractCorefTest.java
@@ -17,9 +17,6 @@
package opennlp.tools.coref;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -29,10 +26,13 @@ import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
public abstract class AbstractCorefTest {
private static final Logger logger =
LoggerFactory.getLogger(AbstractCorefTest.class);
-
+
private static final String BASE_URL_MODELS_V15 =
"https://opennlp.sourceforge.net/models-1.5/";
protected static final Path OPENNLP_DIR =
Paths.get(System.getProperty("user.home") + "/.opennlp/");
protected static final String MODEL_DIR = "/models/coref/en";
diff --git a/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java b/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java
index 7f426e0..5f852b8 100644
--- a/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java
+++ b/opennlp-coref/src/test/java/opennlp/tools/formats/muc/Muc6FullParseCorefSampleStreamFactoryTest.java
@@ -27,6 +27,7 @@ import org.junit.jupiter.api.Test;
import opennlp.tools.coref.AbstractCorefTest;
import opennlp.tools.coref.CorefSample;
import opennlp.tools.coref.mention.Parse;
+import opennlp.tools.models.ModelType;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.ObjectStream;
@@ -49,7 +50,7 @@ public class Muc6FullParseCorefSampleStreamFactoryTest extends AbstractCorefTest
@BeforeAll
public static void initEnv() throws IOException {
Muc6FullParseCorefSampleStreamFactory.registerFactory();
- DownloadUtil.downloadModel("en", DownloadUtil.ModelType.TOKENIZER,
TokenizerModel.class);
+ DownloadUtil.downloadModel("en", ModelType.TOKENIZER,
TokenizerModel.class);
downloadVersion15Model(MODEL_PARSER);
downloadVersion15Model(MODEL_NER_PER);
downloadVersion15Model(MODEL_NER_ORG);
diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml
index cb0514a..91679aa 100644
--- a/opennlp-similarity/pom.xml
+++ b/opennlp-similarity/pom.xml
@@ -37,6 +37,7 @@
<hdf5.version>1.14.3-1.5.10</hdf5.version>
<javacpp.version>1.5.11</javacpp.version>
<openblas.version>0.3.28-1.5.11</openblas.version>
+ <httpclient.version>4.5.14</httpclient.version>
</properties>
<repositories>
@@ -59,12 +60,12 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
- <version>4.5.14</version>
+ <version>${httpclient.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient-cache</artifactId>
- <version>4.5.14</version>
+ <version>${httpclient.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
@@ -74,12 +75,12 @@
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpmime</artifactId>
- <version>4.5.14</version>
+ <version>${httpclient.version}</version>
</dependency>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>fluent-hc</artifactId>
- <version>4.5.14</version>
+ <version>${httpclient.version}</version>
</dependency>
</dependencies>
</dependencyManagement>
@@ -90,9 +91,31 @@
<artifactId>opennlp-tools</artifactId>
</dependency>
<dependency>
- <groupId>org.apache.commons</groupId>
- <artifactId>commons-math3</artifactId>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools-models</artifactId>
+ </dependency>
+
+ <!-- Required English model resources at runtime -->
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-tokenizer-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-pos-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-sentdetect-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
</dependency>
+ <!-- End model resources -->
+
<dependency>
<groupId>commons-io</groupId>
<artifactId>commons-io</artifactId>
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
index 97eda63..818c4ab 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java
@@ -25,9 +25,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Map;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.cmdline.parser.ParserTool;
+import opennlp.tools.models.ClassPathModelProvider;
+import opennlp.tools.models.DefaultClassPathModelProvider;
+import opennlp.tools.models.ModelType;
import opennlp.tools.parser.AbstractBottomUpParser;
import opennlp.tools.parser.Parse;
import opennlp.tools.parser.Parser;
@@ -49,13 +55,13 @@ import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.DownloadUtil;
import opennlp.tools.util.Span;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
public class ParserChunker2MatcherProcessor {
private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+ private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider();
+
static final int MIN_SENTENCE_LENGTH = 10;
protected static ParserChunker2MatcherProcessor instance;
@@ -89,32 +95,27 @@ public class ParserChunker2MatcherProcessor {
}
protected void initializeSentenceDetector() throws IOException {
- SentenceModel model = DownloadUtil.downloadModel(
- "en", DownloadUtil.ModelType.SENTENCE_DETECTOR,
SentenceModel.class);
- sentenceDetector = new ThreadSafeSentenceDetectorME(model);
+ final SentenceModel sm = MODEL_PROVIDER.load("en", ModelType.SENTENCE_DETECTOR, SentenceModel.class);
+ sentenceDetector = new ThreadSafeSentenceDetectorME(sm);
}
protected void initializeTokenizer() throws IOException {
- TokenizerModel model = DownloadUtil.downloadModel(
- "en", DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class);
- tokenizer = new ThreadSafeTokenizerME(model);
+ final TokenizerModel tm = MODEL_PROVIDER.load("en", ModelType.TOKENIZER, TokenizerModel.class);
+ tokenizer = new ThreadSafeTokenizerME(tm);
}
protected void initializePosTagger() throws IOException {
- POSModel model = DownloadUtil.downloadModel(
- "en", DownloadUtil.ModelType.POS, POSModel.class);
- posTagger = new ThreadSafePOSTaggerME(model);
+ final POSModel pm = MODEL_PROVIDER.load("en", ModelType.POS_GENERIC, POSModel.class);
+ posTagger = new ThreadSafePOSTaggerME(pm);
}
protected void initializeParser() throws IOException {
- ParserModel model = DownloadUtil.downloadModel(
- "en", DownloadUtil.ModelType.PARSER, ParserModel.class);
+ ParserModel model = DownloadUtil.downloadModel("en", ModelType.PARSER, ParserModel.class);
parser = ParserFactory.create(model);
}
private void initializeChunker() throws IOException {
- ChunkerModel model = DownloadUtil.downloadModel(
- "en", DownloadUtil.ModelType.CHUNKER, ChunkerModel.class);
+ ChunkerModel model = DownloadUtil.downloadModel("en", ModelType.CHUNKER, ChunkerModel.class);
chunker = new ChunkerME(model);
}
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
index c5e5dca..c5db267 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java
@@ -41,9 +41,7 @@ import java.util.List;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import opennlp.tools.textsimilarity.LemmaPair;
import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
import opennlp.tools.textsimilarity.SentencePairMatchResult;
import opennlp.tools.textsimilarity.TextProcessor;
@@ -51,13 +49,13 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor
private static final Logger LOG =
LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
- protected static ParserPure2MatcherProcessor pinstance;
+ private static ParserPure2MatcherProcessor pInstance;
public synchronized static ParserPure2MatcherProcessor getInstance() {
- if (pinstance == null)
- pinstance = new ParserPure2MatcherProcessor();
+ if (pInstance == null)
+ pInstance = new ParserPure2MatcherProcessor();
- return pinstance;
+ return pInstance;
}
private ParserPure2MatcherProcessor() {
@@ -71,8 +69,8 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor
}
}
- public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence(
- String sentence) {
+ @Override
+ public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence(String sentence) {
if (sentence == null || sentence.trim().length() < MIN_SENTENCE_LENGTH)
return null;
@@ -118,25 +116,16 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor
return listOfChunks;
}
+ @Override
public SentencePairMatchResult assessRelevance(String para1, String para2) {
-
- List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1), sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2);
-
- List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst);
-
- ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic();
- List<List<ParseTreeChunk>> res = md
- .matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst);
- return new SentencePairMatchResult(res, origChunks1);
-
+ return super.assessRelevance(para1, para2);
}
public static void main(String[] args) throws Exception {
ParserPure2MatcherProcessor parser =
ParserPure2MatcherProcessor.getInstance();
String text = "Its classy design and the Mercedes name make it a very cool
vehicle to drive. ";
- List<List<ParseTreeChunk>> res = parser
- .formGroupedPhrasesFromChunksForPara(text);
+ List<List<ParseTreeChunk>> res = parser.formGroupedPhrasesFromChunksForPara(text);
System.out.println(res);
String phrase1 = "Its classy design and the Mercedes name make it a very
cool vehicle to drive. "
diff --git a/opennlp-wsd/pom.xml b/opennlp-wsd/pom.xml
index da3a81d..c8c3d2f 100644
--- a/opennlp-wsd/pom.xml
+++ b/opennlp-wsd/pom.xml
@@ -37,6 +37,31 @@
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools-models</artifactId>
+ </dependency>
+
+ <!-- Required English model resources at runtime -->
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-tokenizer-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-pos-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-lemmatizer-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <!-- End model resources -->
<dependency>
<groupId>net.sf.extjwnl</groupId>
diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
index 3613c0d..7ae0d26 100644
--- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
+++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDHelper.java
@@ -35,12 +35,16 @@ import org.slf4j.LoggerFactory;
import opennlp.tools.lemmatizer.Lemmatizer;
import opennlp.tools.lemmatizer.LemmatizerModel;
import opennlp.tools.lemmatizer.ThreadSafeLemmatizerME;
+import opennlp.tools.models.ModelType;
+import opennlp.tools.models.ClassPathModelProvider;
+import opennlp.tools.models.DefaultClassPathModelProvider;
+import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTagFormat;
import opennlp.tools.postag.POSTagger;
import opennlp.tools.postag.ThreadSafePOSTaggerME;
import opennlp.tools.tokenize.ThreadSafeTokenizerME;
import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.util.DownloadUtil;
+import opennlp.tools.tokenize.TokenizerModel;
/**
* A helper class that loads and organizes resources, and provides helper methods
@@ -52,6 +56,8 @@ public class WSDHelper {
private static final Pattern NUMBERS_PATTERN = Pattern.compile(".*[0-9].*");
+ private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider();
+
private static Tokenizer tokenizer;
private static POSTagger tagger;
private static Lemmatizer lemmatizer;
@@ -274,9 +280,9 @@ public class WSDHelper {
private static Lemmatizer getLemmatizer(String lang) {
if (lemmatizer == null) {
try {
- LemmatizerModel lm = DownloadUtil.downloadModel(lang,
- DownloadUtil.ModelType.LEMMATIZER, LemmatizerModel.class);
- lemmatizer = new ThreadSafeLemmatizerME(lm);
+ final LemmatizerModel lm = MODEL_PROVIDER.load(
+ lang, ModelType.LEMMATIZER, LemmatizerModel.class);
+ lemmatizer = new ThreadSafeLemmatizerME(lm);
} catch (IOException e) {
throw new RuntimeException("Error opening or loading a Lemmatizer from
specified resource file!", e);
}
@@ -288,10 +294,11 @@ public class WSDHelper {
return getTagger("en");
}
- private static POSTagger getTagger(String language) {
+ private static POSTagger getTagger(String lang) {
if (tagger == null) {
try {
- tagger = new ThreadSafePOSTaggerME(language, POSTagFormat.PENN);
+ final POSModel pm = MODEL_PROVIDER.load(lang, ModelType.POS_GENERIC, POSModel.class);
+ tagger = new ThreadSafePOSTaggerME(pm, POSTagFormat.PENN);
} catch (IOException e) {
throw new RuntimeException("Error opening or loading a Tokenizer for
specified language!", e);
}
@@ -303,10 +310,11 @@ public class WSDHelper {
return getTokenizer("en");
}
- private static Tokenizer getTokenizer(String language) {
+ private static Tokenizer getTokenizer(String lang) {
if (tokenizer == null) {
try {
- tokenizer = new ThreadSafeTokenizerME(language);
+ final TokenizerModel tm = MODEL_PROVIDER.load(lang, ModelType.TOKENIZER, TokenizerModel.class);
+ tokenizer = new ThreadSafeTokenizerME(tm);
} catch (IOException e) {
throw new RuntimeException("Error opening or loading a Tokenizer for
specified language!", e);
}
diff --git a/pom.xml b/pom.xml
index 750c8d2..b808ac8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -122,7 +122,8 @@
<maven.compiler.target>${java.version}</maven.compiler.target>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
- <opennlp.tools.version>2.5.4</opennlp.tools.version>
+ <opennlp.tools.version>2.5.5-SNAPSHOT</opennlp.tools.version>
+ <opennlp.models.version>1.2.0</opennlp.models.version>
<opennlp.forkCount>1.0C</opennlp.forkCount>
<commons-beanutils.version>1.10.1</commons-beanutils.version>
@@ -163,6 +164,11 @@
<groupId>${project.groupId}</groupId>
<version>${opennlp.tools.version}</version>
</dependency>
+ <dependency>
+ <artifactId>opennlp-tools-models</artifactId>
+ <groupId>${project.groupId}</groupId>
+ <version>${opennlp.tools.version}</version>
+ </dependency>
<dependency>
<artifactId>opennlp-tools</artifactId>
diff --git a/summarizer/pom.xml b/summarizer/pom.xml
index cab7aaf..4aeb1a2 100644
--- a/summarizer/pom.xml
+++ b/summarizer/pom.xml
@@ -49,6 +49,25 @@
<groupId>org.apache.opennlp</groupId>
<artifactId>opennlp-tools</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools-models</artifactId>
+ </dependency>
+
+ <!-- Required English model resources at runtime -->
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-pos-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-models-sentdetect-en</artifactId>
+ <version>${opennlp.models.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+ <!-- End model resources -->
<dependency>
<groupId>edu.mit</groupId>
diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java
index 2c03eef..3313e33 100755
--- a/summarizer/src/main/java/opennlp/summarization/Sentence.java
+++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java
@@ -47,10 +47,10 @@ public class Sentence {
/**
* Instantiates a plain {@link Sentence} via a set of parameters.
*
- * @param id A numeric identifier with a postive value.
+ * @param id A numeric identifier with a positive value starting at {@code zero}.
* @param stringVal The string representation of the sentence.
- * @param paragraph TODO clarify exact meaning of and constraints for this parameter.
- * @param paraPos clarify exact meaning of and constraints for this parameter.
+ * @param paragraph The n-th paragraph number within a document.
+ * @param paraPos The index position of the {@code paragraph}.
* @throws IllegalArgumentException Thrown if parameters are invalid.
*/
public Sentence(int id, String stringVal, int paragraph, int paraPos) {
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java
index 2acc60b..63e5844 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java
@@ -24,10 +24,13 @@ import java.util.List;
import java.util.Map;
import java.util.Set;
+import opennlp.tools.models.ClassPathModelProvider;
+import opennlp.tools.models.DefaultClassPathModelProvider;
+import opennlp.tools.models.ModelType;
import opennlp.tools.postag.POSModel;
import opennlp.tools.postag.POSTaggerME;
+import opennlp.tools.postag.ThreadSafePOSTaggerME;
import opennlp.tools.tokenize.WhitespaceTokenizer;
-import opennlp.tools.util.DownloadUtil;
/**
* A {@link POSTagger} wrapper implementation that relies on an OpenNLP {@link POSTaggerME}.
@@ -40,7 +43,9 @@ public class NounPOSTagger implements POSTagger {
public static final String[] TAGS_NOUNS = {"NOUN", "NN", "NNS", "NNP",
"NNPS"};
private static final Set<String> EOS_CHARS = Set.of(".", "?", "!");
- private final POSTaggerME tagger;
+ private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider();
+
+ private final ThreadSafePOSTaggerME tagger;
private final Map<Integer, String[]> tagMap = new Hashtable<>();
/**
@@ -56,8 +61,8 @@ public class NounPOSTagger implements POSTagger {
throw new IllegalArgumentException("Parameter 'languageCode' must not be
null");
// init Tag map
tagMap.put(POSTagger.NOUN, TAGS_NOUNS);
- POSModel posModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.POS, POSModel.class);
- tagger = new POSTaggerME(posModel);
+ final POSModel pm = MODEL_PROVIDER.load(languageCode, ModelType.POS_GENERIC, POSModel.class);
+ tagger = new ThreadSafePOSTaggerME(pm);
}
/**
@@ -105,8 +110,10 @@ public class NounPOSTagger implements POSTagger {
*/
@Override
public List<String> getWordsOfType(String[] tokens, int type) {
- if (tokens == null) throw new IllegalArgumentException("Parameter 'tokens' must not be null");
- if (type < 0 || type > PRONOUN) throw new IllegalArgumentException("Parameter 'type' must be in range [0, 4]");
+ if (tokens == null)
+ throw new IllegalArgumentException("Parameter 'tokens' must not be
null");
+ if (type < 0 || type > PRONOUN)
+ throw new IllegalArgumentException("Parameter 'type' must be in range
[0, 4]");
List<String> ret = new ArrayList<>();
for (String t : tokens) {
diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
index 9079b62..ca8eb18 100644
--- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
+++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java
@@ -150,7 +150,7 @@ public class WordRelationshipDetermination {
}
for (ISynsetID id : rels) {
- ISynset s = this.DICTIONARY.getSynset(id);
+ ISynset s = DICTIONARY.getSynset(id);
IWord mat = inSynset(s, idxNoun);
if (mat != null) {
ret = new WordnetWord(noun, mat.getSenseKey(), mat.getID());
@@ -191,7 +191,7 @@ public class WordRelationshipDetermination {
List<Word> ret = new ArrayList<>();
try {
// openDict();
- List<IWordID> wordIDs = this.DICTIONARY.getIndexWord(noun, POS.NOUN).getWordIDs();
+ List<IWordID> wordIDs = DICTIONARY.getIndexWord(noun, POS.NOUN).getWordIDs();
for (IWordID wid : wordIDs) {
Word w = new WordnetWord(noun, wid);
ret.add(w);
diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
index a638d68..6b50a9a 100755
--- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
+++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java
@@ -31,19 +31,24 @@ import java.util.regex.Pattern;
import opennlp.summarization.Sentence;
import opennlp.summarization.DocProcessor;
+import opennlp.tools.models.ClassPathModelProvider;
+import opennlp.tools.models.DefaultClassPathModelProvider;
+import opennlp.tools.models.ModelType;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
import opennlp.tools.stemmer.PorterStemmer;
import opennlp.tools.stemmer.Stemmer;
-import opennlp.tools.util.DownloadUtil;
/**
* Parses a document to sentences.
*/
public class DefaultDocProcessor implements DocProcessor {
+
+ private static final ClassPathModelProvider MODEL_PROVIDER = new DefaultClassPathModelProvider();
+
private static final String REGEX = "\"|'";
private final static Pattern REPLACEMENT_PATTERN =
- Pattern.compile("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;");
+ Pattern.compile("&#?[0-9a-zA-Z][0-9a-zA-Z][0-9a-zA-Z]?;");
// Sentence fragmentation to use..
private static final int OPEN_NLP = 1;
@@ -65,10 +70,10 @@ public class DefaultDocProcessor implements DocProcessor {
if (languageCode == null || languageCode.isBlank())
throw new IllegalArgumentException("Parameter 'languageCode' must not be
null or blank");
stemmer = new PorterStemmer();
- sentModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class);
+ sentModel = MODEL_PROVIDER.load(languageCode, ModelType.SENTENCE_DETECTOR, SentenceModel.class);
}
- // Str - Document or para
+ // Str - Document or paragraph
// sentences - List containing returned sentences
// iidx - if not null update with the words in the sentence + sent id
// processedSent - Sentences after stemming and stopword removal..
@@ -123,7 +128,6 @@ public class DefaultDocProcessor implements DocProcessor {
}
}
-
/**
* Reads a document's content from a file.
*