This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1698-Switch-to-extjwnl-in-jwnl-addon in repository https://gitbox.apache.org/repos/asf/opennlp-addons.git
commit 62dea271fdc41ce7cd2c23b0552b75c15d374bc2 Author: Martin Wiesner <[email protected]> AuthorDate: Wed Jan 22 11:34:08 2025 +0100 OPENNLP-1698: Switch to extjwnl in jwnl-addon - migrates jwnl-addon to net.sf.extjwnl (2.0.5) - simplifies constructor of JWNLLemmatizer - adds new unit test: JWNLLemmatizerTest with line coverage > 90% --- jwnl-addon/pom.xml | 26 ++++- .../opennlp/jwnl/lemmatizer/JWNLLemmatizer.java | 106 ++++++--------------- .../jwnl/lemmatizer/JWNLLemmatizerTest.java | 77 +++++++++++++++ jwnl-addon/src/test/resources/log4j2.xml | 37 +++++++ pom.xml | 1 - 5 files changed, 167 insertions(+), 80 deletions(-) diff --git a/jwnl-addon/pom.xml b/jwnl-addon/pom.xml index 3a39da5..089968b 100644 --- a/jwnl-addon/pom.xml +++ b/jwnl-addon/pom.xml @@ -33,6 +33,11 @@ <packaging>jar</packaging> <name>Apache OpenNLP JWNL Addon</name> + <properties> + <extjwnl.version>2.0.5</extjwnl.version> + <wn-data.version>1.2</wn-data.version> + </properties> + <dependencies> <dependency> <groupId>org.apache.opennlp</groupId> @@ -40,10 +45,16 @@ </dependency> <dependency> - <groupId>net.sf.jwordnet</groupId> - <artifactId>jwnl</artifactId> - <version>1.3.3</version> - <scope>compile</scope> + <groupId>net.sf.extjwnl</groupId> + <artifactId>extjwnl</artifactId> + <version>${extjwnl.version}</version> + </dependency> + + <dependency> + <groupId>net.sf.extjwnl</groupId> + <artifactId>extjwnl-data-wn31</artifactId> + <version>${wn-data.version}</version> + <scope>runtime</scope> </dependency> <dependency> @@ -60,6 +71,13 @@ <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-params</artifactId> </dependency> + + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + <version>${log4j2.version}</version> + <scope>test</scope> + </dependency> </dependencies> <build> diff --git a/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java b/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java index ba55a0f..b85b2b8 100644 --- a/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java +++ b/jwnl-addon/src/main/java/opennlp/jwnl/lemmatizer/JWNLLemmatizer.java @@ -17,92 +17,51 @@ package opennlp.jwnl.lemmatizer; -import java.io.IOException; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; -import java.util.Map; -import net.didion.jwnl.JWNLException; -import net.didion.jwnl.data.Adjective; -import net.didion.jwnl.data.FileDictionaryElementFactory; -import net.didion.jwnl.data.IndexWord; -import net.didion.jwnl.data.POS; -import net.didion.jwnl.data.PointerType; -import net.didion.jwnl.data.VerbFrame; -import net.didion.jwnl.dictionary.FileBackedDictionary; -import net.didion.jwnl.dictionary.MorphologicalProcessor; -import net.didion.jwnl.dictionary.file_manager.FileManager; -import net.didion.jwnl.dictionary.file_manager.FileManagerImpl; -import net.didion.jwnl.dictionary.morph.DefaultMorphologicalProcessor; -import net.didion.jwnl.dictionary.morph.DetachSuffixesOperation; -import net.didion.jwnl.dictionary.morph.LookupExceptionsOperation; -import net.didion.jwnl.dictionary.morph.LookupIndexWordOperation; -import net.didion.jwnl.dictionary.morph.Operation; -import net.didion.jwnl.dictionary.morph.TokenizerOperation; -import net.didion.jwnl.princeton.data.PrincetonWN17FileDictionaryElementFactory; -import net.didion.jwnl.princeton.file.PrincetonRandomAccessDictionaryFile; +import net.sf.extjwnl.JWNLException; +import net.sf.extjwnl.data.IndexWord; +import net.sf.extjwnl.data.POS; +import net.sf.extjwnl.dictionary.Dictionary; +import net.sf.extjwnl.dictionary.MorphologicalProcessor; + import opennlp.tools.lemmatizer.Lemmatizer; +/** + * A {@link Lemmatizer} implementation based on extJWNL + * and underlying WordNet resources. + * + * @see Dictionary + * @see MorphologicalProcessor + * @see POS + */ public class JWNLLemmatizer implements Lemmatizer { - private net.didion.jwnl.dictionary.Dictionary dict; - private MorphologicalProcessor morphy; + private final MorphologicalProcessor morphy; /** - * Creates JWNL dictionary and morphological processor objects in - * JWNLemmatizer constructor. It also loads the JWNL configuration into the - * constructor. - * <p> - * Constructor code based on Apache OpenNLP JWNLDictionary class. + * Initializes a {@link JWNLLemmatizer} instance. + * Loads {@link Dictionary JWNL dictionary} and {@link MorphologicalProcessor} objects. + * It also loads the JWNL configuration. * - * @param wnDirectory - * @throws IOException + * @throws JWNLException Thrown if errors occurred ramping up the WordNet resources. */ - public JWNLLemmatizer(String wnDirectory) throws IOException { + public JWNLLemmatizer() throws JWNLException { super(); - PointerType.initialize(); - Adjective.initialize(); - VerbFrame.initialize(); - Map<POS, String[][]> suffixMap = new HashMap<>(); - suffixMap.put(POS.NOUN, new String[][] { { "s", "" }, { "ses", "s" }, - { "xes", "x" }, { "zes", "z" }, { "ches", "ch" }, { "shes", "sh" }, - { "men", "man" }, { "ies", "y" } }); - suffixMap.put(POS.VERB, new String[][] { { "s", "" }, { "ies", "y" }, - { "es", "e" }, { "es", "" }, { "ed", "e" }, { "ed", "" }, - { "ing", "e" }, { "ing", "" } }); - suffixMap.put(POS.ADJECTIVE, new String[][] { { "er", "" }, { "est", "" }, - { "er", "e" }, { "est", "e" } }); - DetachSuffixesOperation tokDso = new DetachSuffixesOperation(suffixMap); - tokDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] { - new LookupIndexWordOperation(), new LookupExceptionsOperation() }); - TokenizerOperation tokOp = new TokenizerOperation(new String[] { " ", "-" }); - tokOp.addDelegate(TokenizerOperation.TOKEN_OPERATIONS, - new Operation[] { new LookupIndexWordOperation(), - new LookupExceptionsOperation(), tokDso }); - DetachSuffixesOperation morphDso = new DetachSuffixesOperation(suffixMap); - morphDso.addDelegate(DetachSuffixesOperation.OPERATIONS, new Operation[] { - new LookupIndexWordOperation(), new LookupExceptionsOperation() }); - Operation[] operations = { new LookupExceptionsOperation(), morphDso, tokOp }; - morphy = new DefaultMorphologicalProcessor(operations); - FileManager manager = new FileManagerImpl(wnDirectory, - PrincetonRandomAccessDictionaryFile.class); - FileDictionaryElementFactory factory = new PrincetonWN17FileDictionaryElementFactory(); - FileBackedDictionary.install(manager, morphy, factory, true); - dict = net.didion.jwnl.dictionary.Dictionary.getInstance(); + Dictionary dict = Dictionary.getDefaultResourceInstance(); morphy = dict.getMorphologicalProcessor(); } /** - * It takes a word and a POS tag and obtains a word's lemma from WordNet. + * Takes a word and a POS tag and obtains a word's lemma from WordNet. * - * @param word - * @param postag - * @return lemma + * @param word The word to find the corresponding lemma for. + * @param postag The POS tag associated with the {@code word}. + * @return lemma The lemma as provided by WordNet, or {@code null} if not found. */ public String lemmatize(String word, String postag) { String constantTag = "NNP"; - IndexWord baseForm; String lemma; try { POS pos; @@ -117,18 +76,15 @@ public class JWNLLemmatizer implements Lemmatizer { } else { pos = POS.ADVERB; } - baseForm = morphy.lookupBaseForm(pos, word); + IndexWord baseForm = morphy.lookupBaseForm(pos, word); if (baseForm != null) { - lemma = baseForm.getLemma().toString(); + lemma = baseForm.getLemma(); + } else if (postag.startsWith(constantTag)) { + lemma = word; + } else { + lemma= word.toLowerCase(); } - else if (baseForm == null && postag.startsWith(constantTag)) { - lemma = word; - } - else { - lemma= word.toLowerCase(); - } } catch (JWNLException e) { - e.printStackTrace(); return null; } return lemma; diff --git a/jwnl-addon/src/test/java/opennlp/jwnl/lemmatizer/JWNLLemmatizerTest.java b/jwnl-addon/src/test/java/opennlp/jwnl/lemmatizer/JWNLLemmatizerTest.java new file mode 100644 index 0000000..e417830 --- /dev/null +++ b/jwnl-addon/src/test/java/opennlp/jwnl/lemmatizer/JWNLLemmatizerTest.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.jwnl.lemmatizer; + +import java.util.List; +import java.util.stream.Stream; + +import net.sf.extjwnl.JWNLException; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class JWNLLemmatizerTest { + + // SUT + private JWNLLemmatizer lemmatizer; + + @BeforeEach + public void setUp() throws JWNLException { + lemmatizer = new JWNLLemmatizer(); + } + + @ParameterizedTest + @MethodSource("provideData") + public void testLemmatize(String word, String posTag, String expectedLemma) { + String lemma = lemmatizer.lemmatize(word, posTag); + assertNotNull(lemma); + assertEquals(expectedLemma, lemma); + } + + @ParameterizedTest + @MethodSource("provideData") + public void testLemmatizeArray(String word, String posTag, String expectedLemma) { + String[] lemma = lemmatizer.lemmatize(new String[]{word}, new String[]{posTag}); + assertNotNull(lemma); + assertEquals(1, lemma.length); + assertEquals(expectedLemma, lemma[0]); + } + + @Test + public void testLemmatizeList() { + assertThrows(UnsupportedOperationException.class, () -> { + lemmatizer.lemmatize(List.of("mouse"), List.of("NN")); + }); + } + + private static Stream<Arguments> provideData() { + return Stream.of( + Arguments.of("the", "DT", "the"), + Arguments.of("cats", "NN", "cat"), + Arguments.of("saw", "VB", "see"), + Arguments.of("best", "JJS", "good"), + Arguments.of("upside", "RB", "upside") + ); + } +} diff --git a/jwnl-addon/src/test/resources/log4j2.xml b/jwnl-addon/src/test/resources/log4j2.xml new file mode 100644 index 0000000..8b6b24c --- /dev/null +++ b/jwnl-addon/src/test/resources/log4j2.xml @@ -0,0 +1,37 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<Configuration> + <Appenders> + <Console name="STDOUT" target="SYSTEM_OUT"> + <!-- + The pattern can be adjusted as needed, see https://logging.apache.org/log4j/2.x/manual/layouts.html + --> + <PatternLayout pattern="%m%n"/> + </Console> + </Appenders> + + <Loggers> + <Logger name="opennlp.jwnl.lemmatizer" level="warn"/> + <Logger name="opennlp.tools" level="warn"/> + <Root level="INFO"> + <AppenderRef ref="STDOUT"/> + </Root> + </Loggers> +</Configuration> diff --git a/pom.xml b/pom.xml index 77eb51f..5c46bf7 100644 --- a/pom.xml +++ b/pom.xml @@ -342,7 +342,6 @@ <plugin> <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> - <version>3.13.0</version> <configuration> <release>${java.version}</release> <compilerArgument>-Xlint</compilerArgument>
