This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1585-Reduce-creation-of-String-objects-for-prefixes-in-several-FeatureGenerator-classes in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 3bea64d4a42676d4579f65900f200094609b6981 Author: Martin Wiesner <[email protected]> AuthorDate: Mon Jul 1 11:18:53 2024 +0200 OPENNLP-1585 Reduce creation of String objects for prefixes in several FeatureGenerator classes - extract constants from prefixes in several FeatureGenerator classes - improves JavaDoc along the path --- .../main/java/opennlp/tools/ngram/NGramModel.java | 9 +++--- .../AdditionalContextFeatureGenerator.java | 4 ++- .../tools/util/featuregen/BrownCluster.java | 1 + .../featuregen/CharacterNgramFeatureGenerator.java | 6 ++-- .../tools/util/featuregen/GeneratorFactory.java | 2 +- .../util/featuregen/PrefixFeatureGenerator.java | 4 ++- .../SentenceFeatureGeneratorFactory.java | 1 - .../tools/util/featuregen/StringPattern.java | 32 +++++++++++++++++----- .../util/featuregen/SuffixFeatureGenerator.java | 4 ++- .../featuregen/TokenClassFeatureGenerator.java | 10 +++---- .../util/featuregen/TokenFeatureGenerator.java | 6 ++-- .../featuregen/TokenPatternFeatureGenerator.java | 12 +++++--- 12 files changed, 60 insertions(+), 31 deletions(-) diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java index db1beee0..87572d7f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java +++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java @@ -46,16 +46,16 @@ public class NGramModel implements Iterable<StringList> { private final Map<StringList, Integer> mNGrams = new LinkedHashMap<>(); /** - * Initializes an empty instance. + * Instantiates an empty {@link NGramModel} instance. */ public NGramModel() { } /** - * Initializes the current instance. + * Instantiates a {@link NGramModel} via an {@link InputStream} reference. * * @param in the serialized model stream - * @throws IOException + * @throws IOException Thrown if errors occurred reading from {@code in}. 
*/ public NGramModel(InputStream in) throws IOException { DictionaryEntryPersistor.create(in, entry -> { @@ -67,8 +67,7 @@ public class NGramModel implements Iterable<StringList> { countValueString = entry.attributes().getValue(COUNT); if (countValueString == null) { - throw new InvalidFormatException( - "The count attribute must be set!"); + throw new InvalidFormatException("The count attribute must be set!"); } count = Integer.parseInt(countValueString); diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java index 24fdf5c9..c35a1c1a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/AdditionalContextFeatureGenerator.java @@ -26,6 +26,8 @@ import java.util.List; */ public class AdditionalContextFeatureGenerator implements AdaptiveFeatureGenerator { + private static final String PREFIX = "ne="; + private String[][] additionalContext; @Override @@ -36,7 +38,7 @@ public class AdditionalContextFeatureGenerator implements AdaptiveFeatureGenerat String[] context = additionalContext[index]; for (String s : context) { - features.add("ne=" + s); + features.add(PREFIX + s); } } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java index b4ddd2b7..37641a2a 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownCluster.java @@ -39,6 +39,7 @@ import opennlp.tools.util.model.SerializableArtifact; * <p> * Originally available at: <a href="http://metaoptimize.com/projects/wordreprs/"> * http://metaoptimize.com/projects/wordreprs/</a>. 
+ * <p> * Further details can be found in the * <a href="https://dl.acm.org/doi/10.5555/1858681.1858721">related research paper</a>. * <p> diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java index 50cb2522..62a0b109 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CharacterNgramFeatureGenerator.java @@ -29,6 +29,8 @@ import opennlp.tools.util.StringUtil; */ public class CharacterNgramFeatureGenerator implements AdaptiveFeatureGenerator { + private static final String PREFIX = "ng="; + private final int minLength; private final int maxLength; @@ -45,7 +47,7 @@ public class CharacterNgramFeatureGenerator implements AdaptiveFeatureGenerator /** * Initializes a {@link CharacterNgramFeatureGenerator} with - * min 2 length and max 5 length of ngrams. + * min length of {@code 2} and max length of {@code 5} for ngrams. 
*/ public CharacterNgramFeatureGenerator() { this(2, 5); @@ -58,7 +60,7 @@ public class CharacterNgramFeatureGenerator implements AdaptiveFeatureGenerator for (StringList tokenList : model) { if (tokenList.size() > 0) { - features.add("ng=" + StringUtil.toLowerCase(tokenList.getToken(0))); + features.add(PREFIX + StringUtil.toLowerCase(tokenList.getToken(0))); } } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java index efbae0ec..7dccaed6 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java @@ -516,7 +516,7 @@ public class GeneratorFactory { } /** - * @return null if the subclass uses {@link #resourceManager} to instantiate + * @return {@code null} if the subclass uses {@link #resourceManager} to instantiate * @throws InvalidFormatException */ public abstract AdaptiveFeatureGenerator create() throws InvalidFormatException; diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java index 2e10195f..6466feef 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/PrefixFeatureGenerator.java @@ -21,6 +21,8 @@ import java.util.List; public class PrefixFeatureGenerator implements AdaptiveFeatureGenerator { + private static final String PREFIX = "pre="; + static final int DEFAULT_MAX_LENGTH = 4; private final int prefixLength; @@ -38,7 +40,7 @@ public class PrefixFeatureGenerator implements AdaptiveFeatureGenerator { String[] previousOutcomes) { String[] prefs = getPrefixes(tokens[index]); for (String pref : prefs) { - features.add("pre=" + pref); + features.add(PREFIX + pref); } } diff --git 
a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java index 3ffefaa6..d452fe7e 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SentenceFeatureGeneratorFactory.java @@ -31,7 +31,6 @@ public class SentenceFeatureGeneratorFactory @Override public AdaptiveFeatureGenerator create() throws InvalidFormatException { - String beginFeatureString = generatorElement.getAttribute("begin"); return new SentenceFeatureGenerator(getBool("begin", true), getBool("end", true)); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java index af4cb96b..500b5a24 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/StringPattern.java @@ -141,49 +141,49 @@ public class StringPattern { } /** - * @return true if all characters are letters. + * @return {@code true} if all characters are letters. */ public boolean isAllLetter() { return (pattern & ALL_LETTERS) > 0; } /** - * @return true if first letter is capital. + * @return {@code true} if first letter is capital. */ public boolean isInitialCapitalLetter() { return (pattern & INITAL_CAPITAL_LETTER) > 0; } /** - * @return true if all letters are capital. + * @return {@code true} if all letters are capital. */ public boolean isAllCapitalLetter() { return (pattern & ALL_CAPITAL_LETTER) > 0; } /** - * @return true if all letters are lower case. + * @return {@code true} if all letters are lower case. */ public boolean isAllLowerCaseLetter() { return (pattern & ALL_LOWERCASE_LETTER) > 0; } /** - * @return true if all chars are digits. 
+ * @return {@code true} if all chars are digits. */ public boolean isAllDigit() { return (pattern & ALL_DIGIT) > 0; } /** - * @return true if all chars are hiragana. + * @return {@code true} if all chars are hiragana. */ public boolean isAllHiragana() { return (pattern & ALL_HIRAGANA) > 0; } /** - * @return true if all chars are katakana. + * @return {@code true} if all chars are katakana. */ public boolean isAllKatakana() { return (pattern & ALL_KATAKANA) > 0; @@ -196,26 +196,44 @@ public class StringPattern { return digits; } + /** + * @return {@code true} if a period is contained. + */ public boolean containsPeriod() { return (pattern & CONTAINS_PERIOD) > 0; } + /** + * @return {@code true} if a comma is contained. + */ public boolean containsComma() { return (pattern & CONTAINS_COMMA) > 0; } + /** + * @return {@code true} if a slash is contained. + */ public boolean containsSlash() { return (pattern & CONTAINS_SLASH) > 0; } + /** + * @return {@code true} if a digit is contained. + */ public boolean containsDigit() { return (pattern & CONTAINS_DIGIT) > 0; } + /** + * @return {@code true} if a hyphen is contained. + */ public boolean containsHyphen() { return (pattern & CONTAINS_HYPHEN) > 0; } + /** + * @return {@code true} if letters are contained. 
+ */ public boolean containsLetters() { return (pattern & CONTAINS_LETTERS) > 0; } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java index f1a18d83..e890cf08 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/SuffixFeatureGenerator.java @@ -21,6 +21,8 @@ import java.util.List; public class SuffixFeatureGenerator implements AdaptiveFeatureGenerator { + private static final String PREFIX = "suf="; + static final int DEFAULT_MAX_LENGTH = 4; private final int suffixLength; @@ -38,7 +40,7 @@ public class SuffixFeatureGenerator implements AdaptiveFeatureGenerator { String[] previousOutcomes) { String[] suffs = getSuffixes(tokens[index]); for (String suff : suffs) { - features.add("suf=" + suff); + features.add(PREFIX + suff); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java index 56546c37..07a8d40f 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenClassFeatureGenerator.java @@ -23,12 +23,12 @@ import opennlp.tools.util.StringUtil; /** - * Generates features for different for the class of the token. + * Generates features for the class of the token. 
*/ public class TokenClassFeatureGenerator implements AdaptiveFeatureGenerator { - private static final String TOKEN_CLASS_PREFIX = "wc"; - private static final String TOKEN_AND_CLASS_PREFIX = "w&c"; + private static final String TOKEN_CLASS_PREFIX = "wc="; + private static final String TOKEN_AND_CLASS_PREFIX = "w&c="; private final boolean generateWordAndClassFeature; @@ -43,10 +43,10 @@ public class TokenClassFeatureGenerator implements AdaptiveFeatureGenerator { @Override public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) { String wordClass = FeatureGeneratorUtil.tokenFeature(tokens[index]); - features.add(TOKEN_CLASS_PREFIX + "=" + wordClass); + features.add(TOKEN_CLASS_PREFIX + wordClass); if (generateWordAndClassFeature) { - features.add(TOKEN_AND_CLASS_PREFIX + "=" + StringUtil.toLowerCase(tokens[index]) + + features.add(TOKEN_AND_CLASS_PREFIX + StringUtil.toLowerCase(tokens[index]) + "," + wordClass); } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java index ced4a55c..53119e06 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenFeatureGenerator.java @@ -27,7 +27,7 @@ import opennlp.tools.util.StringUtil; */ public class TokenFeatureGenerator implements AdaptiveFeatureGenerator { - private static final String WORD_PREFIX = "w"; + private static final String WORD_PREFIX = "w="; private final boolean lowercase; /** @@ -49,10 +49,10 @@ public class TokenFeatureGenerator implements AdaptiveFeatureGenerator { @Override public void createFeatures(List<String> features, String[] tokens, int index, String[] preds) { if (lowercase) { - features.add(WORD_PREFIX + "=" + StringUtil.toLowerCase(tokens[index])); + features.add(WORD_PREFIX + StringUtil.toLowerCase(tokens[index])); } 
else { - features.add(WORD_PREFIX + "=" + tokens[index]); + features.add(WORD_PREFIX + tokens[index]); } } } diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java index 99adfc98..512230a5 100644 --- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java +++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/TokenPatternFeatureGenerator.java @@ -31,6 +31,10 @@ import opennlp.tools.util.StringUtil; */ public class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator { + private static final String SUB_TOKEN_PREFIX = "st=" ; + private static final String SUB_TOKEN_PART2_PREFIX = "pt2=" ; + private static final String SUB_TOKEN_PART3_PREFIX = "pt3=" ; + private final Pattern noLetters = Pattern.compile("[^a-zA-Z]"); private final Tokenizer tokenizer; @@ -57,7 +61,7 @@ public class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator { String[] tokenized = tokenizer.tokenize(toks[index]); if (tokenized.length == 1) { - feats.add("st=" + StringUtil.toLowerCase(toks[index])); + feats.add(SUB_TOKEN_PREFIX + StringUtil.toLowerCase(toks[index])); return; } @@ -68,12 +72,12 @@ public class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator { for (int i = 0; i < tokenized.length; i++) { if (i < tokenized.length - 1) { - feats.add("pt2=" + FeatureGeneratorUtil.tokenFeature(tokenized[i]) + + feats.add(SUB_TOKEN_PART2_PREFIX + FeatureGeneratorUtil.tokenFeature(tokenized[i]) + FeatureGeneratorUtil.tokenFeature(tokenized[i + 1])); } if (i < tokenized.length - 2) { - feats.add("pt3=" + FeatureGeneratorUtil.tokenFeature(tokenized[i]) + + feats.add(SUB_TOKEN_PART3_PREFIX + FeatureGeneratorUtil.tokenFeature(tokenized[i]) + FeatureGeneratorUtil.tokenFeature(tokenized[i + 1]) + FeatureGeneratorUtil.tokenFeature(tokenized[i + 2])); } @@ -81,7 +85,7 @@ public 
class TokenPatternFeatureGenerator implements AdaptiveFeatureGenerator { pattern.append(FeatureGeneratorUtil.tokenFeature(tokenized[i])); if (!noLetters.matcher(tokenized[i]).find()) { - feats.add("st=" + StringUtil.toLowerCase(tokenized[i])); + feats.add(SUB_TOKEN_PREFIX + StringUtil.toLowerCase(tokenized[i])); } }
