Copilot commented on code in PR #1104: URL: https://github.com/apache/opennlp/pull/1104#discussion_r3459140178
########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java: ########## @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.tokenize.uax29; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +/** + * Looks up the Unicode {@link WordBreak Word_Break} property of a code point. + * + * <p>The data is loaded once from the {@code WordBreakProperty.txt} resource of the Unicode + * Character Database (parsed with simple cursor scanning, no regular expression). 
Lookup is O(1) + * for the Basic Multilingual Plane (a direct array index) and O(log n) for supplementary code + * points (a binary search over a small sorted range table), so it imposes no per-character + * allocation on the word boundary algorithm.</p> + */ +public final class WordBreakProperty { + + private static final String RESOURCE = "WordBreakProperty.txt"; + + private static final WordBreak[] VALUES = WordBreak.values(); + + // Word_Break value ordinal for each BMP code point; the default 0 is WordBreak.OTHER. + private static final byte[] BMP = new byte[0x10000]; + + // Supplementary ranges (above the BMP), sorted by start for binary search. + private static final int[] SUPPLEMENTARY_START; + private static final int[] SUPPLEMENTARY_END; + private static final byte[] SUPPLEMENTARY_VALUE; + + static { + final List<int[]> supplementary = new ArrayList<>(); + try (InputStream in = WordBreakProperty.class.getResourceAsStream(RESOURCE)) { + if (in == null) { + throw new IllegalStateException("Missing Word_Break data resource: " + RESOURCE); + } + load(in, supplementary); + } catch (IOException e) { + throw new UncheckedIOException("Unable to read Word_Break data resource " + RESOURCE, e); + } + supplementary.sort((a, b) -> Integer.compare(a[0], b[0])); + SUPPLEMENTARY_START = new int[supplementary.size()]; + SUPPLEMENTARY_END = new int[supplementary.size()]; + SUPPLEMENTARY_VALUE = new byte[supplementary.size()]; + for (int i = 0; i < supplementary.size(); i++) { + final int[] range = supplementary.get(i); + SUPPLEMENTARY_START[i] = range[0]; + SUPPLEMENTARY_END[i] = range[1]; + SUPPLEMENTARY_VALUE[i] = (byte) range[2]; + } + } + + private WordBreakProperty() { + } + + private static void load(InputStream in, List<int[]> supplementary) throws IOException { + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + final int hash = line.indexOf('#'); + 
final String content = (hash < 0 ? line : line.substring(0, hash)).strip(); + if (content.isEmpty()) { + continue; + } + final int semicolon = content.indexOf(';'); + final String codePoints = content.substring(0, semicolon).strip(); + final String value = content.substring(semicolon + 1).strip(); + final byte ordinal = (byte) WordBreak.fromPropertyName(value).ordinal(); + + final int dots = codePoints.indexOf(".."); + final int start; + final int end; + if (dots < 0) { + start = Integer.parseInt(codePoints, 16); + end = start; + } else { + start = Integer.parseInt(codePoints.substring(0, dots), 16); + end = Integer.parseInt(codePoints.substring(dots + 2), 16); + } + assign(start, end, ordinal, supplementary); + } + } + } + + private static void assign(int start, int end, byte ordinal, List<int[]> supplementary) { + final int bmpEnd = Math.min(end, 0xFFFF); + for (int codePoint = start; codePoint <= bmpEnd; codePoint++) { + BMP[codePoint] = ordinal; + } + if (end > 0xFFFF) { + supplementary.add(new int[] {Math.max(start, 0x10000), end, ordinal}); + } + } Review Comment: Filling BMP ranges one code point at a time is avoidably expensive during class initialization. Using a bulk fill (e.g., `Arrays.fill` over `[start, bmpEnd + 1)`) significantly reduces startup cost while preserving behavior. Note that the fill must be guarded with `start <= bmpEnd`: for ranges that lie entirely above the BMP the current loop simply does nothing, whereas `Arrays.fill` throws `IllegalArgumentException` when `fromIndex > toIndex`. ########## opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java: ########## @@ -0,0 +1,98 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.tokenize.uax29; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Objects; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertTrue; + +/** + * Runs the official Unicode {@code WordBreakTest.txt} conformance suite against + * {@link WordSegmenter}. Each line marks boundaries with U+00F7 (division sign) and non-boundaries + * with U+00D7 (multiplication sign) between code points. + */ +public class WordBoundaryConformanceTest { + + private static final int BOUNDARY = 0x00F7; // division sign + + @Test + void testOfficialUnicodeWordBreakConformance() throws IOException { + int total = 0; + int passed = 0; + final List<String> failures = new ArrayList<>(); + + try (InputStream in = Objects.requireNonNull( + WordBoundaryConformanceTest.class.getResourceAsStream("WordBreakTest.txt"), + "Missing test resource: WordBreakTest.txt"); + BufferedReader reader = + new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String raw; + int lineNumber = 0; + while ((raw = reader.readLine()) != null) { + lineNumber++; + final int hash = raw.indexOf('#'); + final String content = (hash < 0 ? 
raw : raw.substring(0, hash)).strip(); + if (content.isEmpty()) { + continue; + } + final String[] tokens = content.split("\\s+"); + + final StringBuilder text = new StringBuilder(); + final List<Integer> expected = new ArrayList<>(); + expected.add(0); // tokens[0] is always a leading boundary marker. + int offset = 0; + for (int k = 1; k < tokens.length; k += 2) { + final int codePoint = Integer.parseInt(tokens[k], 16); + text.appendCodePoint(codePoint); + offset += Character.charCount(codePoint); + if (tokens[k + 1].codePointAt(0) == BOUNDARY) { + expected.add(offset); + } + } + + final int[] actual = WordSegmenter.boundaries(text); + final int[] expectedArray = expected.stream().mapToInt(Integer::intValue).toArray(); + total++; + if (Arrays.equals(actual, expectedArray)) { + passed++; + } else if (failures.size() < 25) { + failures.add("line " + lineNumber + ": " + content + + "\n expected=" + Arrays.toString(expectedArray) + + "\n actual =" + Arrays.toString(actual)); + } + } + } + + final int passRate = total == 0 ? 0 : passed * 100 / total; + System.out.println("UAX#29 word-break conformance: " + passed + "/" + total + + " (" + passRate + "%)"); + assertTrue(total > 1900, "expected the full conformance suite to load, ran only " + total); Review Comment: Printing directly to stdout from a unit test is noisy in CI and makes failures harder to spot. Consider removing the `System.out.println` entirely, or route the info through JUnit facilities (e.g., `TestReporter`) so it’s only shown when useful. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java: ########## @@ -0,0 +1,364 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.EnumMap; +import java.util.EnumSet; +import java.util.List; +import java.util.Locale; +import java.util.Objects; +import java.util.Set; + +import opennlp.tools.lemmatizer.Lemmatizer; +import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.tokenize.uax29.WordTokenizer; +import opennlp.tools.util.Span; + +/** + * Builds {@link Term}s by segmenting text and applying a configured stack of normalization + * {@link Dimension}s to each token. The analyzer is the configuration; each {@link Term} is the + * layered result for one token, with the configured dimensions computed eagerly and any other + * dimension computed lazily on first request. + * + * <p>Segmentation uses the Unicode {@linkplain WordTokenizer UAX #29 word tokenizer}, so the + * input does not need to be pre-tokenized. The character-level dimensions ({@link Dimension#NFC} + * through {@link Dimension#ACCENT_FOLD}) have built-in defaults; {@link Dimension#STEM} and + * {@link Dimension#LEMMA} are enabled by supplying a {@link Stemmer} or {@link Lemmatizer}.</p> + * + * <p>An instance is immutable and is thread-safe when its configured transforms are. 
The built-in + * character normalizers are stateless, but the Snowball stemmers are not, so an analyzer configured + * with a {@link Stemmer} (for example through {@link NormalizationProfile#searchAnalyzer()}) should + * not be shared across threads when {@link Dimension#STEM} is used. Build one with + * {@link #builder()}.</p> + */ +public final class TermAnalyzer { + + private final List<Dimension> chain; + private final Dimension finalDimension; + private final EnumMap<Dimension, CharSequenceNormalizer> transforms; + private final Stemmer stemmer; + private final Lemmatizer lemmatizer; + private final WordTokenizer tokenizer; + + private TermAnalyzer(Builder builder) { + final List<Dimension> ordered = new ArrayList<>(builder.chain); + Collections.sort(ordered); // canonical pipeline order (enum declaration order) + this.chain = List.copyOf(ordered); + this.finalDimension = ordered.isEmpty() ? Dimension.ORIGINAL : ordered.get(ordered.size() - 1); + // Only the per-analyzer overrides from the builder; the defaults live on Dimension itself. + this.transforms = new EnumMap<>(builder.transforms); + this.stemmer = builder.stemmer; + this.lemmatizer = builder.lemmatizer; + this.tokenizer = builder.tokenizer; + } + + /** + * {@return a new builder} + */ + public static Builder builder() { + return new Builder(); + } + + /** + * Segments {@code text} with the UAX #29 word tokenizer and returns one {@link Term} per + * word token, in order. The terms carry no part-of-speech tag, so {@link Dimension#LEMMA} is not + * available from them. + * + * @param text The text to analyze. + * @return The terms. 
+ */ + public List<Term> analyze(CharSequence text) { + final List<Span> spans = tokenizer.tokenizeSpans(text); + final List<Term> terms = new ArrayList<>(spans.size()); + for (final Span span : spans) { + terms.add(new Term(this, span.getCoveredText(text).toString(), span, null)); + } + return terms; + } + + /** + * Returns one {@link Term} per supplied token, attaching the matching part-of-speech tag so that + * {@link Dimension#LEMMA} can be computed. The terms have no source span. + * + * @param tokens The tokens. + * @param tags The part-of-speech tag for each token; must be the same length as {@code tokens}. + * @return The terms. + * @throws IllegalArgumentException if {@code tokens} and {@code tags} differ in length. + */ + public List<Term> analyze(String[] tokens, String[] tags) { + if (tokens.length != tags.length) { + throw new IllegalArgumentException( + "tokens and tags must be the same length, got " + tokens.length + " and " + tags.length); + } + final List<Term> terms = new ArrayList<>(tokens.length); + for (int i = 0; i < tokens.length; i++) { + terms.add(new Term(this, tokens[i], null, tags[i])); + } + return terms; + } + + /** + * {@return the configured dimensions that are computed eagerly, in canonical order} The list + * never includes {@link Dimension#ORIGINAL}, which is always present. + */ + public List<Dimension> dimensions() { + return chain; + } + + Dimension finalDimension() { + return finalDimension; + } + + // Applies one dimension's transform to a single token value. Fails loudly when a token-level + // dimension was requested without the engine (or tag) it needs. 
+ String apply(Dimension dimension, String input, String posTag) { + switch (dimension) { + case ORIGINAL: + return input; + case STEM: + if (stemmer == null) { + throw new IllegalStateException( + "Dimension STEM requires a Stemmer; configure it with builder().stem(...)"); + } + return stemmer.stem(input).toString(); + case LEMMA: + if (lemmatizer == null) { + throw new IllegalStateException( + "Dimension LEMMA requires a Lemmatizer; configure it with builder().lemmatize(...)"); + } + if (posTag == null) { + throw new IllegalStateException( + "Dimension LEMMA requires a part-of-speech tag; use analyze(tokens, tags)"); + } + return lemmatizer.lemmatize(new String[] {input}, new String[] {posTag})[0]; + default: + // A builder override wins; otherwise the dimension's own default normalizer. + final CharSequenceNormalizer normalizer = transforms.containsKey(dimension) + ? transforms.get(dimension) : dimension.defaultNormalizer(); + return normalizer.normalize(input).toString(); + } Review Comment: `Dimension.defaultNormalizer()` can return `null` (per its implementation), which would cause a `NullPointerException` here for any character-level dimension without a default and without a builder override. Consider explicitly checking for `null` and throwing a targeted `IllegalStateException` explaining that the requested dimension has no default normalizer and must be configured via `builder().transform(...)`. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java: ########## @@ -0,0 +1,157 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.tokenize.uax29; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.List; + +/** + * Looks up the Unicode {@link WordBreak Word_Break} property of a code point. + * + * <p>The data is loaded once from the {@code WordBreakProperty.txt} resource of the Unicode + * Character Database (parsed with simple cursor scanning, no regular expression). Lookup is O(1) + * for the Basic Multilingual Plane (a direct array index) and O(log n) for supplementary code + * points (a binary search over a small sorted range table), so it imposes no per-character + * allocation on the word boundary algorithm.</p> + */ +public final class WordBreakProperty { + + private static final String RESOURCE = "WordBreakProperty.txt"; + + private static final WordBreak[] VALUES = WordBreak.values(); + + // Word_Break value ordinal for each BMP code point; the default 0 is WordBreak.OTHER. + private static final byte[] BMP = new byte[0x10000]; + + // Supplementary ranges (above the BMP), sorted by start for binary search. 
+ private static final int[] SUPPLEMENTARY_START; + private static final int[] SUPPLEMENTARY_END; + private static final byte[] SUPPLEMENTARY_VALUE; + + static { + final List<int[]> supplementary = new ArrayList<>(); + try (InputStream in = WordBreakProperty.class.getResourceAsStream(RESOURCE)) { + if (in == null) { + throw new IllegalStateException("Missing Word_Break data resource: " + RESOURCE); + } + load(in, supplementary); + } catch (IOException e) { + throw new UncheckedIOException("Unable to read Word_Break data resource " + RESOURCE, e); + } + supplementary.sort((a, b) -> Integer.compare(a[0], b[0])); + SUPPLEMENTARY_START = new int[supplementary.size()]; + SUPPLEMENTARY_END = new int[supplementary.size()]; + SUPPLEMENTARY_VALUE = new byte[supplementary.size()]; + for (int i = 0; i < supplementary.size(); i++) { + final int[] range = supplementary.get(i); + SUPPLEMENTARY_START[i] = range[0]; + SUPPLEMENTARY_END[i] = range[1]; + SUPPLEMENTARY_VALUE[i] = (byte) range[2]; + } + } + + private WordBreakProperty() { + } + + private static void load(InputStream in, List<int[]> supplementary) throws IOException { + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + final int hash = line.indexOf('#'); + final String content = (hash < 0 ? 
line : line.substring(0, hash)).strip(); + if (content.isEmpty()) { + continue; + } + final int semicolon = content.indexOf(';'); + final String codePoints = content.substring(0, semicolon).strip(); + final String value = content.substring(semicolon + 1).strip(); + final byte ordinal = (byte) WordBreak.fromPropertyName(value).ordinal(); + + final int dots = codePoints.indexOf(".."); + final int start; + final int end; + if (dots < 0) { + start = Integer.parseInt(codePoints, 16); + end = start; + } else { + start = Integer.parseInt(codePoints.substring(0, dots), 16); + end = Integer.parseInt(codePoints.substring(dots + 2), 16); + } + assign(start, end, ordinal, supplementary); + } + } + } + + private static void assign(int start, int end, byte ordinal, List<int[]> supplementary) { + final int bmpEnd = Math.min(end, 0xFFFF); + for (int codePoint = start; codePoint <= bmpEnd; codePoint++) { + BMP[codePoint] = ordinal; + } + if (end > 0xFFFF) { + supplementary.add(new int[] {Math.max(start, 0x10000), end, ordinal}); + } + } + + /** + * {@return the {@link WordBreak} value of a code point} + * + * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} return + * {@link WordBreak#OTHER}. + */ + public static WordBreak of(int codePoint) { + return VALUES[ordinalOf(codePoint)]; + } + + /** + * {@return the {@link WordBreak#ordinal() ordinal} of a code point's {@link WordBreak} value} + * This is the allocation-free form of {@link #of(int)} for hot loops that work with ordinals. + * + * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} return the ordinal of + * {@link WordBreak#OTHER}. + */ + public static int ordinalOf(int codePoint) { + if (codePoint >= 0 && codePoint <= 0xFFFF) { + return BMP[codePoint]; + } + return ordinalOfSupplementary(codePoint); + } Review Comment: `BMP[codePoint]` is a signed `byte` promoted to `int`, so values ≥ 128 would become negative due to sign extension. 
Even though current `WordBreak` ordinals fit in a byte, it’s safer to return `BMP[codePoint] & 0xFF` (and similarly mask `SUPPLEMENTARY_VALUE[mid]`) to prevent future latent index bugs. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfiles.java: ########## @@ -0,0 +1,122 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.MissingResourceException; +import java.util.Objects; +import java.util.Optional; +import java.util.Set; + +import opennlp.tools.langdetect.LanguageDetector; +import opennlp.tools.stemmer.snowball.SnowballStemmer; + +/** + * A registry of {@link NormalizationProfile}s by language, with detection-based fallback. This is + * the language dispatch the design note calls for: pick the profile for a requested language, or + * detect the language with a {@link LanguageDetector} when it is unspecified. The covered languages + * are exactly those with a Snowball stemmer. 
+ * + * <p>Profiles are keyed by ISO 639-3 code (what {@link LanguageDetector} produces); + * {@link #forLanguage(String)} also accepts ISO 639-1 two-letter codes.</p> + */ +public final class NormalizationProfiles { + + private static final Map<String, NormalizationProfile> BY_LANGUAGE = build(); + + private NormalizationProfiles() { + } + + private static Map<String, NormalizationProfile> build() { + final Map<String, NormalizationProfile> map = new HashMap<>(); + // The generic accent fold is used for English and the major Romance languages, German uses its + // own ae/oe/ue/ss fold, and folding is disabled elsewhere (Nordic, non-Latin) where diacritics + // mark distinct letters. + final CharSequenceNormalizer latin = AccentFoldCharSequenceNormalizer.getInstance(); + final CharSequenceNormalizer german = GermanUmlautCharSequenceNormalizer.getInstance(); + add(map, "ara", SnowballStemmer.ALGORITHM.ARABIC, null); + add(map, "cat", SnowballStemmer.ALGORITHM.CATALAN, latin); + add(map, "dan", SnowballStemmer.ALGORITHM.DANISH, null); + add(map, "deu", SnowballStemmer.ALGORITHM.GERMAN, german); + add(map, "ell", SnowballStemmer.ALGORITHM.GREEK, null); + add(map, "eng", SnowballStemmer.ALGORITHM.ENGLISH, latin); + add(map, "fin", SnowballStemmer.ALGORITHM.FINNISH, null); + add(map, "fra", SnowballStemmer.ALGORITHM.FRENCH, latin); + add(map, "gle", SnowballStemmer.ALGORITHM.IRISH, null); + add(map, "hun", SnowballStemmer.ALGORITHM.HUNGARIAN, null); + add(map, "ind", SnowballStemmer.ALGORITHM.INDONESIAN, null); + add(map, "ita", SnowballStemmer.ALGORITHM.ITALIAN, latin); + add(map, "nld", SnowballStemmer.ALGORITHM.DUTCH, null); + add(map, "nor", SnowballStemmer.ALGORITHM.NORWEGIAN, null); + add(map, "por", SnowballStemmer.ALGORITHM.PORTUGUESE, latin); + add(map, "ron", SnowballStemmer.ALGORITHM.ROMANIAN, null); + add(map, "rus", SnowballStemmer.ALGORITHM.RUSSIAN, null); + add(map, "spa", SnowballStemmer.ALGORITHM.SPANISH, latin); + add(map, "swe", 
SnowballStemmer.ALGORITHM.SWEDISH, null); + return Map.copyOf(map); + } + + private static void add(Map<String, NormalizationProfile> map, String language, + SnowballStemmer.ALGORITHM algorithm, CharSequenceNormalizer accentFold) { + map.put(language, new NormalizationProfile(language, algorithm, accentFold)); + } + + /** + * Returns the profile for a language. + * + * @param language An ISO 639-3 or ISO 639-1 language code; case-insensitive. + * @return The profile, or empty if the language has no Snowball stemmer. + */ + public static Optional<NormalizationProfile> forLanguage(String language) { + Objects.requireNonNull(language, "language"); + String code = language.strip().toLowerCase(Locale.ROOT); + if (code.length() == 2) { + try { + final String iso3 = Locale.of(code).getISO3Language(); + if (!iso3.isEmpty()) { + code = iso3; + } + } catch (MissingResourceException ignored) { + // No ISO 639-3 code for this two-letter code; fall through and look up as given. + } + } + return Optional.ofNullable(BY_LANGUAGE.get(code)); + } Review Comment: `Locale.of(code)` is not available on older Java targets (e.g., Java 17), which will break compilation. Prefer `new Locale(code)` (or `Locale.forLanguageTag(code)`) for ISO3 resolution to keep compatibility with common LTS baselines. ########## opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java: ########## @@ -0,0 +1,87 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.tokenize.uax29; + +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertSame; + +public class WordBreakPropertyTest { + + @Test + void testAsciiLettersAndDigits() { + assertSame(WordBreak.ALETTER, WordBreakProperty.of('a')); + assertSame(WordBreak.ALETTER, WordBreakProperty.of('Z')); + assertSame(WordBreak.NUMERIC, WordBreakProperty.of('0')); + assertSame(WordBreak.NUMERIC, WordBreakProperty.of('9')); + } + + @Test + void testWhitespaceAndLineBreaks() { + assertSame(WordBreak.WSEG_SPACE, WordBreakProperty.of(0x0020)); // space + assertSame(WordBreak.CR, WordBreakProperty.of(0x000D)); + assertSame(WordBreak.LF, WordBreakProperty.of(0x000A)); + assertSame(WordBreak.NEWLINE, WordBreakProperty.of(0x000B)); // vertical tab + } + + @Test + void testMidAndExtendClasses() { + assertSame(WordBreak.MID_NUM, WordBreakProperty.of(0x002C)); // comma + assertSame(WordBreak.MID_NUM_LET, WordBreakProperty.of(0x002E)); // full stop + assertSame(WordBreak.MID_LETTER, WordBreakProperty.of(0x003A)); // colon + assertSame(WordBreak.EXTEND_NUM_LET, WordBreakProperty.of(0x005F)); // low line + assertSame(WordBreak.EXTEND, WordBreakProperty.of(0x0301)); // combining acute + } + + @Test + void testQuotesJoinerAndScriptLetters() { + assertSame(WordBreak.SINGLE_QUOTE, WordBreakProperty.of(0x0027)); + assertSame(WordBreak.DOUBLE_QUOTE, 
WordBreakProperty.of(0x0022)); + assertSame(WordBreak.ZWJ, WordBreakProperty.of(0x200D)); + assertSame(WordBreak.HEBREW_LETTER, WordBreakProperty.of(0x05D0)); + assertSame(WordBreak.KATAKANA, WordBreakProperty.of(0x30A1)); + } + + @Test + void testSupplementaryCodePointsUseTheRangeTable() { + assertSame(WordBreak.REGIONAL_INDICATOR, WordBreakProperty.of(0x1F1E6)); // regional indicator A + assertSame(WordBreak.ALETTER, WordBreakProperty.of(0x1D400)); // math bold A + assertSame(WordBreak.OTHER, WordBreakProperty.of(0x1F600)); // grinning face + } + + @ParameterizedTest + @ValueSource(ints = {0x0021, 0x0040, 0x2014}) + void testUnassignedCodePointsAreOther(int codePoint) { + assertSame(WordBreak.OTHER, WordBreakProperty.of(codePoint)); + } Review Comment: The test name says “unassigned code points”, but the provided values (e.g., U+0021, U+0040, U+2014) are assigned punctuation/symbol characters. Renaming the test to reflect what it’s actually asserting (e.g., “punctuation/symbols map to OTHER”) would avoid confusion. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
