This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1584-FeatureGeneratorUtil-shall-detect-German-umlauts-with-dot-as-'cp' in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 161279c6de84bca1013deeb49045d15cae91624a Author: Martin Wiesner <[email protected]> AuthorDate: Mon Jul 1 10:43:15 2024 +0200 OPENNLP-1584 FeatureGeneratorUtil shall detect German umlauts with dot as 'cp' - enhances the Pattern in FeatureGeneratorUtil - adds related test cases - improves JavaDoc of WindowFeatureGenerator along the path --- .../tools/util/featuregen/FeatureGeneratorUtil.java | 5 +---- .../tools/util/featuregen/WindowFeatureGenerator.java | 15 +++++++++------ .../util/featuregen/FeatureGeneratorUtilTest.java | 18 ++++++++++++++++++ 3 files changed, 28 insertions(+), 10 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java index e6b8af95..22373021 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/FeatureGeneratorUtil.java @@ -25,10 +25,7 @@ import java.util.regex.Pattern; */ public class FeatureGeneratorUtil { - private static final String TOKEN_CLASS_PREFIX = "wc"; - private static final String TOKEN_AND_CLASS_PREFIX = "w&c"; - - private static final Pattern capPeriod = Pattern.compile("^[A-Z]\\.$"); + private static final Pattern capPeriod = Pattern.compile("^[A-ZÄÖÜ]\\.$"); /** * Generates a class name for the specified token. diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java index d6359881..c58573aa 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/WindowFeatureGenerator.java @@ -22,13 +22,16 @@ import java.util.ArrayList; import java.util.List; /** - * Generates previous and next features for a given {@link AdaptiveFeatureGenerator}. - * The window size can be specified. - * <p> + * Generates previous (left-sided) and next (right-sided) features for a + * given {@link AdaptiveFeatureGenerator}. The window size can be specified. + * <p><br/> * Features: - * Current token is always included unchanged - * Previous tokens are prefixed with p distance - * Next tokens are prefix with n distance + * <ul> + * <li> Current token is always included unchanged,</li> + * <li>Previous tokens are prefixed with {@code p} distance,</li> + * <li>Next tokens are prefix with {@code n} distance.</li> + * + * @see AdaptiveFeatureGenerator */ public class WindowFeatureGenerator implements AdaptiveFeatureGenerator { diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java index 9655678b..cd35f092 100644 --- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java +++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/FeatureGeneratorUtilTest.java @@ -50,6 +50,24 @@ public class FeatureGeneratorUtilTest { Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("#")); Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("%")); Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("&")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("§")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("^")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("°")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("(")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature(")")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("/")); + Assertions.assertEquals("other", FeatureGeneratorUtil.tokenFeature("\\")); + } + + @Test + void testGerman() { + Assertions.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Änne")); + Assertions.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Özlem")); + Assertions.assertEquals("ic", FeatureGeneratorUtil.tokenFeature("Ümit")); + Assertions.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("Ä.")); + Assertions.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("Ö.")); + Assertions.assertEquals("cp", FeatureGeneratorUtil.tokenFeature("Ü.")); + Assertions.assertEquals("sc", FeatureGeneratorUtil.tokenFeature("Ü")); } @Test
