This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-addons.git
The following commit(s) were added to refs/heads/master by this push:
new 7b9875b OPENNLP-1698: Switch to extjwnl in jwnl-addon - migrates
jwnl-addon to net.sf.extjwnl (2.0.5) - simplifies constructor of JWNLLemmatizer
- adds new unit test: JWNLLemmatizerTest with line coverage > 90%
7b9875b is described below
commit 7b9875b1cf6fd36b517414f34296de3ef88f057f
Author: Martin Wiesner <[email protected]>
AuthorDate: Wed Jan 22 11:34:08 2025 +0100
OPENNLP-1698: Switch to extjwnl in jwnl-addon
- migrates jwnl-addon to net.sf.extjwnl (2.0.5)
- simplifies constructor of JWNLLemmatizer
- adds new unit test: JWNLLemmatizerTest with line coverage > 90%
---
jwnl-addon/pom.xml | 27 +++++-
.../opennlp/jwnl/lemmatizer/JWNLLemmatizer.java | 106 ++++++---------------
.../jwnl/lemmatizer/JWNLLemmatizerTest.java | 77 +++++++++++++++
jwnl-addon/src/test/resources/log4j2.xml | 37 +++++++
pom.xml | 1 -
5 files changed, 168 insertions(+), 80 deletions(-)
diff --git a/jwnl-addon/pom.xml b/jwnl-addon/pom.xml
index 3a39da5..2511645 100644
--- a/jwnl-addon/pom.xml
+++ b/jwnl-addon/pom.xml
@@ -33,6 +33,11 @@
<packaging>jar</packaging>
<name>Apache OpenNLP JWNL Addon</name>
+ <properties>
+ <extjwnl.version>2.0.5</extjwnl.version>
+ <wn-data.version>1.2</wn-data.version>
+ </properties>
+
<dependencies>
<dependency>
<groupId>org.apache.opennlp</groupId>
@@ -40,10 +45,17 @@
</dependency>
<dependency>
- <groupId>net.sf.jwordnet</groupId>
- <artifactId>jwnl</artifactId>
- <version>1.3.3</version>
- <scope>compile</scope>
+ <groupId>net.sf.extjwnl</groupId>
+ <artifactId>extjwnl</artifactId>
+ <version>${extjwnl.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>net.sf.extjwnl</groupId>
+ <artifactId>extjwnl-data-wn31</artifactId>
+ <version>${wn-data.version}</version>
+ <optional>true</optional>
+ <scope>runtime</scope>
</dependency>
<dependency>
@@ -60,6 +72,13 @@
<groupId>org.junit.jupiter</groupId>
<artifactId>junit-jupiter-params</artifactId>
</dependency>
+
+ <dependency>
+ <groupId>org.apache.logging.log4j</groupId>
+ <artifactId>log4j-slf4j2-impl</artifactId>
+ <version>${log4j2.version}</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
<build>
diff --git a/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java b/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java
index ba55a0f..b85b2b8 100644
--- a/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java
+++ b/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java
@@ -17,92 +17,51 @@
package opennlp.jwnl.lemmatizer;
-import java.io.IOException;
import java.util.ArrayList;
-import java.util.HashMap;
import java.util.List;
-import java.util.Map;
-import net.didion.jwnl.JWNLException;
-import net.didion.jwnl.data.Adjective;
-import net.didion.jwnl.data.FileDictionaryElementFactory;
-import net.didion.jwnl.data.IndexWord;
-import net.didion.jwnl.data.POS;
-import net.didion.jwnl.data.PointerType;
-import net.didion.jwnl.data.VerbFrame;
-import net.didion.jwnl.dictionary.FileBackedDictionary;
-import net.didion.jwnl.dictionary.MorphologicalProcessor;
-import net.didion.jwnl.dictionary.file_manager.FileManager;
-import net.didion.jwnl.dictionary.file_manager.FileManagerImpl;
-import net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor;
-import net.didion.jwnl.dictionary.morph.DetachSuffixesOperation;
-import net.didion.jwnl.dictionary.morph.LookupExceptionsOperation;
-import net.didion.jwnl.dictionary.morph.LookupIndexWordOperation;
-import net.didion.jwnl.dictionary.morph.Operation;
-import net.didion.jwnl.dictionary.morph.TokenizerOperation;
-import net.didion.jwnl.princeton.data.PrincetonWN17FileDictionaryElementFactory;
-import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile;
+import net.sf.extjwnl.JWNLException;
+import net.sf.extjwnl.data.IndexWord;
+import net.sf.extjwnl.data.POS;
+import net.sf.extjwnl.dictionary.Dictionary;
+import net.sf.extjwnl.dictionary.MorphologicalProcessor;
+
import opennlp.tools.lemmatizer.Lemmatizer;
+/**
+ * A {@link Lemmatizer} implementation based on extJWNL
+ * and underlying WordNet resources.
+ *
+ * @see Dictionary
+ * @see MorphologicalProcessor
+ * @see POS
+ */
public class JWNLLemmatizer implements Lemmatizer {
- private net.didion.jwnl.dictionary.Dictionary dict;
- private MorphologicalProcessor morphy;
+ private final MorphologicalProcessor morphy;
/**
- * Creates JWNL dictionary and morphological processor objects in
- * JWNLemmatizer constructor. It also loads the JWNL configuration into the
- * constructor.
- * <p>
- * Constructor code based on Apache OpenNLP JWNLDictionary class.
+ * Initializes a {@link JWNLLemmatizer} instance.
+ * Loads {@link Dictionary JWNL dictionary} and {@link MorphologicalProcessor} objects.
+ * It also loads the JWNL configuration.
*
- * @param wnDirectory
- * @throws IOException
+ * @throws JWNLException Thrown if errors occurred ramping up the WordNet resources.
*/
- public JWNLLemmatizer(String wnDirectory) throws IOException {
+ public JWNLLemmatizer() throws JWNLException {
super();
- PointerType.initialize();
- Adjective.initialize();
- VerbFrame.initialize();
- Map<POS, String[][]> suffixMap = new HashMap<>();
- suffixMap.put(POS.NOUN, new String[][] { { "s", "" }, { "ses", "s" },
- { "xes", "x" }, { "zes", "z" }, { "ches", "ch" }, { "shes", "sh" },
- { "men", "man" }, { "ies", "y" } });
- suffixMap.put(POS.VERB, new String[][] { { "s", "" }, { "ies", "y" },
- { "es", "e" }, { "es", "" }, { "ed", "e" }, { "ed", "" },
- { "ing", "e" }, { "ing", "" } });
- suffixMap.put(POS.ADJECTIVE, new String[][] { { "er", "" }, { "est", "" },
- { "er", "e" }, { "est", "e" } });
- DetachSuffixesOperation tokDso = new DetachSuffixesOperation(suffixMap);
- tokDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] {
- new LookupIndexWordOperation(), new LookupExceptionsOperation() });
- TokenizerOperation tokOp = new TokenizerOperation(new String[] { " ", "-" });
- tokOp.addDelegate(TokenizerOperation.TOKEN_OPERATIONS,
- new Operation[] { new LookupIndexWordOperation(),
- new LookupExceptionsOperation(), tokDso });
- DetachSuffixesOperation morphDso = new DetachSuffixesOperation(suffixMap);
- morphDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] {
- new LookupIndexWordOperation(), new LookupExceptionsOperation() });
- Operation[] operations = { new LookupExceptionsOperation(), morphDso, tokOp };
- morphy = new DefaultMorphologicalProcessor(operations);
- FileManager manager = new FileManagerImpl(wnDirectory,
- PrincetonRandomAccessDictionaryFile.class);
- FileDictionaryElementFactory factory = new PrincetonWN17FileDictionaryElementFactory();
- FileBackedDictionary.install(manager, morphy, factory, true);
- dict = net.didion.jwnl.dictionary.Dictionary.getInstance();
+ Dictionary dict = Dictionary.getDefaultResourceInstance();
morphy = dict.getMorphologicalProcessor();
}
/**
- * It takes a word and a POS tag and obtains a word's lemma from WordNet.
+ * Takes a word and a POS tag and obtains a word's lemma from WordNet.
*
- * @param word
- * @param postag
- * @return lemma
+ * @param word The word to find the corresponding lemma for.
+ * @param postag The POS tag associated with the {@code word}.
+ * @return lemma The lemma as provided by WordNet, or {@code null} if not found.
*/
public String lemmatize(String word, String postag) {
String constantTag = "NNP";
- IndexWord baseForm;
String lemma;
try {
POS pos;
@@ -117,18 +76,15 @@ public class JWNLLemmatizer implements Lemmatizer {
} else {
pos = POS.ADVERB;
}
- baseForm = morphy.lookupBaseForm(pos, word);
+ IndexWord baseForm = morphy.lookupBaseForm(pos, word);
if (baseForm != null) {
- lemma = baseForm.getLemma().toString();
+ lemma = baseForm.getLemma();
+ } else if (postag.startsWith(constantTag)) {
+ lemma = word;
+ } else {
+ lemma= word.toLowerCase();
}
- else if (baseForm == null && postag.startsWith(constantTag)) {
- lemma = word;
- }
- else {
- lemma= word.toLowerCase();
- }
} catch (JWNLException e) {
- e.printStackTrace();
return null;
}
return lemma;
diff --git a/jwnl-addon/src/test/java/opennlp/jwnl/lemmatizer/JWNLLemmatizerTest.java b/jwnl-addon/src/test/java/opennlp/jwnl/lemmatizer/JWNLLemmatizerTest.java
new file mode 100644
index 0000000..e417830
--- /dev/null
+++ b/jwnl-addon/src/test/java/opennlp/jwnl/lemmatizer/JWNLLemmatizerTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.jwnl.lemmatizer;
+
+import java.util.List;
+import java.util.stream.Stream;
+
+import net.sf.extjwnl.JWNLException;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.Arguments;
+import org.junit.jupiter.params.provider.MethodSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertThrows;
+
+public class JWNLLemmatizerTest {
+
+ // SUT
+ private JWNLLemmatizer lemmatizer;
+
+ @BeforeEach
+ public void setUp() throws JWNLException {
+ lemmatizer = new JWNLLemmatizer();
+ }
+
+ @ParameterizedTest
+ @MethodSource("provideData")
+ public void testLemmatize(String word, String posTag, String expectedLemma) {
+ String lemma = lemmatizer.lemmatize(word, posTag);
+ assertNotNull(lemma);
+ assertEquals(expectedLemma, lemma);
+ }
+
+ @ParameterizedTest
+ @MethodSource("provideData")
+ public void testLemmatizeArray(String word, String posTag, String expectedLemma) {
+ String[] lemma = lemmatizer.lemmatize(new String[]{word}, new String[]{posTag});
+ assertNotNull(lemma);
+ assertEquals(1, lemma.length);
+ assertEquals(expectedLemma, lemma[0]);
+ }
+
+ @Test
+ public void testLemmatizeList() {
+ assertThrows(UnsupportedOperationException.class, () -> {
+ lemmatizer.lemmatize(List.of("mouse"), List.of("NN"));
+ });
+ }
+
+ private static Stream<Arguments> provideData() {
+ return Stream.of(
+ Arguments.of("the", "DT", "the"),
+ Arguments.of("cats", "NN", "cat"),
+ Arguments.of("saw", "VB", "see"),
+ Arguments.of("best", "JJS", "good"),
+ Arguments.of("upside", "RB", "upside")
+ );
+ }
+}
diff --git a/jwnl-addon/src/test/resources/log4j2.xml b/jwnl-addon/src/test/resources/log4j2.xml
new file mode 100644
index 0000000..8b6b24c
--- /dev/null
+++ b/jwnl-addon/src/test/resources/log4j2.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one
+ or more contributor license agreements. See the NOTICE file
+ distributed with this work for additional information
+ regarding copyright ownership. The ASF licenses this file
+ to you under the Apache License, Version 2.0 (the
+ "License"); you may not use this file except in compliance
+ with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing,
+ software distributed under the License is distributed on an
+ "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ KIND, either express or implied. See the License for the
+ specific language governing permissions and limitations
+ under the License.
+-->
+<Configuration>
+ <Appenders>
+ <Console name="STDOUT" target="SYSTEM_OUT">
+ <!--
+ The pattern can be adjusted as needed, see https://logging.apache.org/log4j/2.x/manual/layouts.html
+ -->
+ <PatternLayout pattern="%m%n"/>
+ </Console>
+ </Appenders>
+
+ <Loggers>
+ <Logger name="opennlp.jwnl.lemmatizer" level="warn"/>
+ <Logger name="opennlp.tools" level="warn"/>
+ <Root level="INFO">
+ <AppenderRef ref="STDOUT"/>
+ </Root>
+ </Loggers>
+</Configuration>
diff --git a/pom.xml b/pom.xml
index 77eb51f..5c46bf7 100644
--- a/pom.xml
+++ b/pom.xml
@@ -342,7 +342,6 @@
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
- <version>3.13.0</version>
<configuration>
<release>${java.version}</release>
<compilerArgument>-Xlint</compilerArgument>