Re: [PR] OPENNLP-1850: UAX #29 word tokenizer and the layered Term model (2/4) (opennlp)

via GitHub Tue, 23 Jun 2026 03:58:13 -0700


Copilot commented on code in PR #1104:
URL: https://github.com/apache/opennlp/pull/1104#discussion_r3459140178



##########
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java:
##########
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Looks up the Unicode {@link WordBreak Word_Break} property of a code point.
+ *
+ * <p>The data is loaded once from the {@code WordBreakProperty.txt} resource 
of the Unicode
+ * Character Database (parsed with simple cursor scanning, no regular 
expression). Lookup is O(1)
+ * for the Basic Multilingual Plane (a direct array index) and O(log n) for 
supplementary code
+ * points (a binary search over a small sorted range table), so it imposes no 
per-character
+ * allocation on the word boundary algorithm.</p>
+ */
+public final class WordBreakProperty {
+
+  private static final String RESOURCE = "WordBreakProperty.txt";
+
+  private static final WordBreak[] VALUES = WordBreak.values();
+
+  // Word_Break value ordinal for each BMP code point; the default 0 is 
WordBreak.OTHER.
+  private static final byte[] BMP = new byte[0x10000];
+
+  // Supplementary ranges (above the BMP), sorted by start for binary search.
+  private static final int[] SUPPLEMENTARY_START;
+  private static final int[] SUPPLEMENTARY_END;
+  private static final byte[] SUPPLEMENTARY_VALUE;
+
+  static {
+    final List<int[]> supplementary = new ArrayList<>();
+    try (InputStream in = 
WordBreakProperty.class.getResourceAsStream(RESOURCE)) {
+      if (in == null) {
+        throw new IllegalStateException("Missing Word_Break data resource: " + 
RESOURCE);
+      }
+      load(in, supplementary);
+    } catch (IOException e) {
+      throw new UncheckedIOException("Unable to read Word_Break data resource 
" + RESOURCE, e);
+    }
+    supplementary.sort((a, b) -> Integer.compare(a[0], b[0]));
+    SUPPLEMENTARY_START = new int[supplementary.size()];
+    SUPPLEMENTARY_END = new int[supplementary.size()];
+    SUPPLEMENTARY_VALUE = new byte[supplementary.size()];
+    for (int i = 0; i < supplementary.size(); i++) {
+      final int[] range = supplementary.get(i);
+      SUPPLEMENTARY_START[i] = range[0];
+      SUPPLEMENTARY_END[i] = range[1];
+      SUPPLEMENTARY_VALUE[i] = (byte) range[2];
+    }
+  }
+
+  private WordBreakProperty() {
+  }
+
+  private static void load(InputStream in, List<int[]> supplementary) throws 
IOException {
+    try (BufferedReader reader =
+             new BufferedReader(new InputStreamReader(in, 
StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        final int hash = line.indexOf('#');
+        final String content = (hash < 0 ? line : line.substring(0, 
hash)).strip();
+        if (content.isEmpty()) {
+          continue;
+        }
+        final int semicolon = content.indexOf(';');
+        final String codePoints = content.substring(0, semicolon).strip();
+        final String value = content.substring(semicolon + 1).strip();
+        final byte ordinal = (byte) 
WordBreak.fromPropertyName(value).ordinal();
+
+        final int dots = codePoints.indexOf("..");
+        final int start;
+        final int end;
+        if (dots < 0) {
+          start = Integer.parseInt(codePoints, 16);
+          end = start;
+        } else {
+          start = Integer.parseInt(codePoints.substring(0, dots), 16);
+          end = Integer.parseInt(codePoints.substring(dots + 2), 16);
+        }
+        assign(start, end, ordinal, supplementary);
+      }
+    }
+  }
+
+  private static void assign(int start, int end, byte ordinal, List<int[]> 
supplementary) {
+    final int bmpEnd = Math.min(end, 0xFFFF);
+    for (int codePoint = start; codePoint <= bmpEnd; codePoint++) {
+      BMP[codePoint] = ordinal;
+    }
+    if (end > 0xFFFF) {
+      supplementary.add(new int[] {Math.max(start, 0x10000), end, ordinal});
+    }
+  }

Review Comment:
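   Filling BMP ranges one code point at a time is avoidably expensive during 
class initialization. Using a bulk fill (e.g., `Arrays.fill` over `[start, 
bmpEnd + 1)`) reduces startup cost. To preserve behavior, guard the call with 
`start <= bmpEnd`: for a range that lies entirely above the BMP the current 
loop simply runs zero times, whereas `Arrays.fill` throws 
`IllegalArgumentException` when `fromIndex > toIndex`.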



##########
opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBoundaryConformanceTest.java:
##########
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
+import org.junit.jupiter.api.Test;
+
+import static org.junit.jupiter.api.Assertions.assertTrue;
+
+/**
+ * Runs the official Unicode {@code WordBreakTest.txt} conformance suite 
against
+ * {@link WordSegmenter}. Each line marks boundaries with U+00F7 (division 
sign) and non-boundaries
+ * with U+00D7 (multiplication sign) between code points.
+ */
+public class WordBoundaryConformanceTest {
+
+  private static final int BOUNDARY = 0x00F7; // division sign
+
+  @Test
+  void testOfficialUnicodeWordBreakConformance() throws IOException {
+    int total = 0;
+    int passed = 0;
+    final List<String> failures = new ArrayList<>();
+
+    try (InputStream in = Objects.requireNonNull(
+             
WordBoundaryConformanceTest.class.getResourceAsStream("WordBreakTest.txt"),
+             "Missing test resource: WordBreakTest.txt");
+         BufferedReader reader =
+             new BufferedReader(new InputStreamReader(in, 
StandardCharsets.UTF_8))) {
+      String raw;
+      int lineNumber = 0;
+      while ((raw = reader.readLine()) != null) {
+        lineNumber++;
+        final int hash = raw.indexOf('#');
+        final String content = (hash < 0 ? raw : raw.substring(0, 
hash)).strip();
+        if (content.isEmpty()) {
+          continue;
+        }
+        final String[] tokens = content.split("\\s+");
+
+        final StringBuilder text = new StringBuilder();
+        final List<Integer> expected = new ArrayList<>();
+        expected.add(0); // tokens[0] is always a leading boundary marker.
+        int offset = 0;
+        for (int k = 1; k < tokens.length; k += 2) {
+          final int codePoint = Integer.parseInt(tokens[k], 16);
+          text.appendCodePoint(codePoint);
+          offset += Character.charCount(codePoint);
+          if (tokens[k + 1].codePointAt(0) == BOUNDARY) {
+            expected.add(offset);
+          }
+        }
+
+        final int[] actual = WordSegmenter.boundaries(text);
+        final int[] expectedArray = 
expected.stream().mapToInt(Integer::intValue).toArray();
+        total++;
+        if (Arrays.equals(actual, expectedArray)) {
+          passed++;
+        } else if (failures.size() < 25) {
+          failures.add("line " + lineNumber + ": " + content
+              + "\n    expected=" + Arrays.toString(expectedArray)
+              + "\n    actual  =" + Arrays.toString(actual));
+        }
+      }
+    }
+
+    final int passRate = total == 0 ? 0 : passed * 100 / total;
+    System.out.println("UAX#29 word-break conformance: " + passed + "/" + total
+        + " (" + passRate + "%)");
+    assertTrue(total > 1900, "expected the full conformance suite to load, ran 
only " + total);

Review Comment:
   Printing directly to stdout from a unit test is noisy in CI and makes 
failures harder to spot. Consider removing the `System.out.println` entirely, 
or route the info through JUnit facilities (e.g., `TestReporter`) so it’s only 
shown when useful.



##########
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/TermAnalyzer.java:
##########
@@ -0,0 +1,364 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.EnumSet;
+import java.util.List;
+import java.util.Locale;
+import java.util.Objects;
+import java.util.Set;
+
+import opennlp.tools.lemmatizer.Lemmatizer;
+import opennlp.tools.stemmer.Stemmer;
+import opennlp.tools.tokenize.uax29.WordTokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * Builds {@link Term}s by segmenting text and applying a configured stack of 
normalization
+ * {@link Dimension}s to each token. The analyzer is the configuration; each 
{@link Term} is the
+ * layered result for one token, with the configured dimensions computed 
eagerly and any other
+ * dimension computed lazily on first request.
+ *
+ * <p>Segmentation uses the Unicode {@linkplain WordTokenizer UAX&#160;#29 
word tokenizer}, so the
+ * input does not need to be pre-tokenized. The character-level dimensions 
({@link Dimension#NFC}
+ * through {@link Dimension#ACCENT_FOLD}) have built-in defaults; {@link 
Dimension#STEM} and
+ * {@link Dimension#LEMMA} are enabled by supplying a {@link Stemmer} or 
{@link Lemmatizer}.</p>
+ *
+ * <p>An instance is immutable and is thread-safe when its configured 
transforms are. The built-in
+ * character normalizers are stateless, but the Snowball stemmers are not, so 
an analyzer configured
+ * with a {@link Stemmer} (for example through {@link 
NormalizationProfile#searchAnalyzer()}) should
+ * not be shared across threads when {@link Dimension#STEM} is used. Build one 
with
+ * {@link #builder()}.</p>
+ */
+public final class TermAnalyzer {
+
+  private final List<Dimension> chain;
+  private final Dimension finalDimension;
+  private final EnumMap<Dimension, CharSequenceNormalizer> transforms;
+  private final Stemmer stemmer;
+  private final Lemmatizer lemmatizer;
+  private final WordTokenizer tokenizer;
+
+  private TermAnalyzer(Builder builder) {
+    final List<Dimension> ordered = new ArrayList<>(builder.chain);
+    Collections.sort(ordered); // canonical pipeline order (enum declaration 
order)
+    this.chain = List.copyOf(ordered);
+    this.finalDimension = ordered.isEmpty() ? Dimension.ORIGINAL : 
ordered.get(ordered.size() - 1);
+    // Only the per-analyzer overrides from the builder; the defaults live on 
Dimension itself.
+    this.transforms = new EnumMap<>(builder.transforms);
+    this.stemmer = builder.stemmer;
+    this.lemmatizer = builder.lemmatizer;
+    this.tokenizer = builder.tokenizer;
+  }
+
+  /**
+   * {@return a new builder}
+   */
+  public static Builder builder() {
+    return new Builder();
+  }
+
+  /**
+   * Segments {@code text} with the UAX&#160;#29 word tokenizer and returns 
one {@link Term} per
+   * word token, in order. The terms carry no part-of-speech tag, so {@link 
Dimension#LEMMA} is not
+   * available from them.
+   *
+   * @param text The text to analyze.
+   * @return The terms.
+   */
+  public List<Term> analyze(CharSequence text) {
+    final List<Span> spans = tokenizer.tokenizeSpans(text);
+    final List<Term> terms = new ArrayList<>(spans.size());
+    for (final Span span : spans) {
+      terms.add(new Term(this, span.getCoveredText(text).toString(), span, 
null));
+    }
+    return terms;
+  }
+
+  /**
+   * Returns one {@link Term} per supplied token, attaching the matching 
part-of-speech tag so that
+   * {@link Dimension#LEMMA} can be computed. The terms have no source span.
+   *
+   * @param tokens The tokens.
+   * @param tags   The part-of-speech tag for each token; must be the same 
length as {@code tokens}.
+   * @return The terms.
+   * @throws IllegalArgumentException if {@code tokens} and {@code tags} 
differ in length.
+   */
+  public List<Term> analyze(String[] tokens, String[] tags) {
+    if (tokens.length != tags.length) {
+      throw new IllegalArgumentException(
+          "tokens and tags must be the same length, got " + tokens.length + " 
and " + tags.length);
+    }
+    final List<Term> terms = new ArrayList<>(tokens.length);
+    for (int i = 0; i < tokens.length; i++) {
+      terms.add(new Term(this, tokens[i], null, tags[i]));
+    }
+    return terms;
+  }
+
+  /**
+   * {@return the configured dimensions that are computed eagerly, in 
canonical order} The list
+   * never includes {@link Dimension#ORIGINAL}, which is always present.
+   */
+  public List<Dimension> dimensions() {
+    return chain;
+  }
+
+  Dimension finalDimension() {
+    return finalDimension;
+  }
+
+  // Applies one dimension's transform to a single token value. Fails loudly 
when a token-level
+  // dimension was requested without the engine (or tag) it needs.
+  String apply(Dimension dimension, String input, String posTag) {
+    switch (dimension) {
+      case ORIGINAL:
+        return input;
+      case STEM:
+        if (stemmer == null) {
+          throw new IllegalStateException(
+              "Dimension STEM requires a Stemmer; configure it with 
builder().stem(...)");
+        }
+        return stemmer.stem(input).toString();
+      case LEMMA:
+        if (lemmatizer == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a Lemmatizer; configure it with 
builder().lemmatize(...)");
+        }
+        if (posTag == null) {
+          throw new IllegalStateException(
+              "Dimension LEMMA requires a part-of-speech tag; use 
analyze(tokens, tags)");
+        }
+        return lemmatizer.lemmatize(new String[] {input}, new String[] 
{posTag})[0];
+      default:
+        // A builder override wins; otherwise the dimension's own default 
normalizer.
+        final CharSequenceNormalizer normalizer = 
transforms.containsKey(dimension)
+            ? transforms.get(dimension) : dimension.defaultNormalizer();
+        return normalizer.normalize(input).toString();
+    }

Review Comment:
   `Dimension.defaultNormalizer()` can return `null` (per its implementation), 
which would cause a `NullPointerException` here for any character-level 
dimension without a default and without a builder override. Consider explicitly 
checking for `null` and throwing a targeted `IllegalStateException` explaining 
that the requested dimension has no default normalizer and must be configured 
via `builder().transform(...)`.



##########
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/tokenize/uax29/WordBreakProperty.java:
##########
@@ -0,0 +1,157 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.UncheckedIOException;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Looks up the Unicode {@link WordBreak Word_Break} property of a code point.
+ *
+ * <p>The data is loaded once from the {@code WordBreakProperty.txt} resource 
of the Unicode
+ * Character Database (parsed with simple cursor scanning, no regular 
expression). Lookup is O(1)
+ * for the Basic Multilingual Plane (a direct array index) and O(log n) for 
supplementary code
+ * points (a binary search over a small sorted range table), so it imposes no 
per-character
+ * allocation on the word boundary algorithm.</p>
+ */
+public final class WordBreakProperty {
+
+  private static final String RESOURCE = "WordBreakProperty.txt";
+
+  private static final WordBreak[] VALUES = WordBreak.values();
+
+  // Word_Break value ordinal for each BMP code point; the default 0 is 
WordBreak.OTHER.
+  private static final byte[] BMP = new byte[0x10000];
+
+  // Supplementary ranges (above the BMP), sorted by start for binary search.
+  private static final int[] SUPPLEMENTARY_START;
+  private static final int[] SUPPLEMENTARY_END;
+  private static final byte[] SUPPLEMENTARY_VALUE;
+
+  static {
+    final List<int[]> supplementary = new ArrayList<>();
+    try (InputStream in = 
WordBreakProperty.class.getResourceAsStream(RESOURCE)) {
+      if (in == null) {
+        throw new IllegalStateException("Missing Word_Break data resource: " + 
RESOURCE);
+      }
+      load(in, supplementary);
+    } catch (IOException e) {
+      throw new UncheckedIOException("Unable to read Word_Break data resource 
" + RESOURCE, e);
+    }
+    supplementary.sort((a, b) -> Integer.compare(a[0], b[0]));
+    SUPPLEMENTARY_START = new int[supplementary.size()];
+    SUPPLEMENTARY_END = new int[supplementary.size()];
+    SUPPLEMENTARY_VALUE = new byte[supplementary.size()];
+    for (int i = 0; i < supplementary.size(); i++) {
+      final int[] range = supplementary.get(i);
+      SUPPLEMENTARY_START[i] = range[0];
+      SUPPLEMENTARY_END[i] = range[1];
+      SUPPLEMENTARY_VALUE[i] = (byte) range[2];
+    }
+  }
+
+  private WordBreakProperty() {
+  }
+
+  private static void load(InputStream in, List<int[]> supplementary) throws 
IOException {
+    try (BufferedReader reader =
+             new BufferedReader(new InputStreamReader(in, 
StandardCharsets.UTF_8))) {
+      String line;
+      while ((line = reader.readLine()) != null) {
+        final int hash = line.indexOf('#');
+        final String content = (hash < 0 ? line : line.substring(0, 
hash)).strip();
+        if (content.isEmpty()) {
+          continue;
+        }
+        final int semicolon = content.indexOf(';');
+        final String codePoints = content.substring(0, semicolon).strip();
+        final String value = content.substring(semicolon + 1).strip();
+        final byte ordinal = (byte) 
WordBreak.fromPropertyName(value).ordinal();
+
+        final int dots = codePoints.indexOf("..");
+        final int start;
+        final int end;
+        if (dots < 0) {
+          start = Integer.parseInt(codePoints, 16);
+          end = start;
+        } else {
+          start = Integer.parseInt(codePoints.substring(0, dots), 16);
+          end = Integer.parseInt(codePoints.substring(dots + 2), 16);
+        }
+        assign(start, end, ordinal, supplementary);
+      }
+    }
+  }
+
+  private static void assign(int start, int end, byte ordinal, List<int[]> 
supplementary) {
+    final int bmpEnd = Math.min(end, 0xFFFF);
+    for (int codePoint = start; codePoint <= bmpEnd; codePoint++) {
+      BMP[codePoint] = ordinal;
+    }
+    if (end > 0xFFFF) {
+      supplementary.add(new int[] {Math.max(start, 0x10000), end, ordinal});
+    }
+  }
+
+  /**
+   * {@return the {@link WordBreak} value of a code point}
+   *
+   * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} 
return
+   *     {@link WordBreak#OTHER}.
+   */
+  public static WordBreak of(int codePoint) {
+    return VALUES[ordinalOf(codePoint)];
+  }
+
+  /**
+   * {@return the {@link WordBreak#ordinal() ordinal} of a code point's {@link 
WordBreak} value}
+   * This is the allocation-free form of {@link #of(int)} for hot loops that 
work with ordinals.
+   *
+   * @param codePoint The code point. Values outside {@code [0, U+10FFFF]} 
return the ordinal of
+   *     {@link WordBreak#OTHER}.
+   */
+  public static int ordinalOf(int codePoint) {
+    if (codePoint >= 0 && codePoint <= 0xFFFF) {
+      return BMP[codePoint];
+    }
+    return ordinalOfSupplementary(codePoint);
+  }

Review Comment:
   `BMP[codePoint]` is a signed `byte` promoted to `int`, so values ≥ 128 would 
become negative due to sign extension. Even though current `WordBreak` ordinals 
fit in a byte, it’s safer to return `BMP[codePoint] & 0xFF` (and similarly mask 
`SUPPLEMENTARY_VALUE[mid]`) to prevent future latent index bugs.



##########
opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/NormalizationProfiles.java:
##########
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import java.util.HashMap;
+import java.util.Locale;
+import java.util.Map;
+import java.util.MissingResourceException;
+import java.util.Objects;
+import java.util.Optional;
+import java.util.Set;
+
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.stemmer.snowball.SnowballStemmer;
+
+/**
+ * A registry of {@link NormalizationProfile}s by language, with 
detection-based fallback. This is
+ * the language dispatch the design note calls for: pick the profile for a 
requested language, or
+ * detect the language with a {@link LanguageDetector} when it is unspecified. 
The covered languages
+ * are exactly those with a Snowball stemmer.
+ *
+ * <p>Profiles are keyed by ISO 639-3 code (what {@link LanguageDetector} 
produces);
+ * {@link #forLanguage(String)} also accepts ISO 639-1 two-letter codes.</p>
+ */
+public final class NormalizationProfiles {
+
+  private static final Map<String, NormalizationProfile> BY_LANGUAGE = build();
+
+  private NormalizationProfiles() {
+  }
+
+  private static Map<String, NormalizationProfile> build() {
+    final Map<String, NormalizationProfile> map = new HashMap<>();
+    // The generic accent fold is used for English and the major Romance 
languages, German uses its
+    // own ae/oe/ue/ss fold, and folding is disabled elsewhere (Nordic, 
non-Latin) where diacritics
+    // mark distinct letters.
+    final CharSequenceNormalizer latin = 
AccentFoldCharSequenceNormalizer.getInstance();
+    final CharSequenceNormalizer german = 
GermanUmlautCharSequenceNormalizer.getInstance();
+    add(map, "ara", SnowballStemmer.ALGORITHM.ARABIC, null);
+    add(map, "cat", SnowballStemmer.ALGORITHM.CATALAN, latin);
+    add(map, "dan", SnowballStemmer.ALGORITHM.DANISH, null);
+    add(map, "deu", SnowballStemmer.ALGORITHM.GERMAN, german);
+    add(map, "ell", SnowballStemmer.ALGORITHM.GREEK, null);
+    add(map, "eng", SnowballStemmer.ALGORITHM.ENGLISH, latin);
+    add(map, "fin", SnowballStemmer.ALGORITHM.FINNISH, null);
+    add(map, "fra", SnowballStemmer.ALGORITHM.FRENCH, latin);
+    add(map, "gle", SnowballStemmer.ALGORITHM.IRISH, null);
+    add(map, "hun", SnowballStemmer.ALGORITHM.HUNGARIAN, null);
+    add(map, "ind", SnowballStemmer.ALGORITHM.INDONESIAN, null);
+    add(map, "ita", SnowballStemmer.ALGORITHM.ITALIAN, latin);
+    add(map, "nld", SnowballStemmer.ALGORITHM.DUTCH, null);
+    add(map, "nor", SnowballStemmer.ALGORITHM.NORWEGIAN, null);
+    add(map, "por", SnowballStemmer.ALGORITHM.PORTUGUESE, latin);
+    add(map, "ron", SnowballStemmer.ALGORITHM.ROMANIAN, null);
+    add(map, "rus", SnowballStemmer.ALGORITHM.RUSSIAN, null);
+    add(map, "spa", SnowballStemmer.ALGORITHM.SPANISH, latin);
+    add(map, "swe", SnowballStemmer.ALGORITHM.SWEDISH, null);
+    return Map.copyOf(map);
+  }
+
+  private static void add(Map<String, NormalizationProfile> map, String 
language,
+      SnowballStemmer.ALGORITHM algorithm, CharSequenceNormalizer accentFold) {
+    map.put(language, new NormalizationProfile(language, algorithm, 
accentFold));
+  }
+
+  /**
+   * Returns the profile for a language.
+   *
+   * @param language An ISO 639-3 or ISO 639-1 language code; case-insensitive.
+   * @return The profile, or empty if the language has no Snowball stemmer.
+   */
+  public static Optional<NormalizationProfile> forLanguage(String language) {
+    Objects.requireNonNull(language, "language");
+    String code = language.strip().toLowerCase(Locale.ROOT);
+    if (code.length() == 2) {
+      try {
+        final String iso3 = Locale.of(code).getISO3Language();
+        if (!iso3.isEmpty()) {
+          code = iso3;
+        }
+      } catch (MissingResourceException ignored) {
+        // No ISO 639-3 code for this two-letter code; fall through and look 
up as given.
+      }
+    }
+    return Optional.ofNullable(BY_LANGUAGE.get(code));
+  }

Review Comment:
   `Locale.of(code)` was added in Java 19, so it is not available on older 
Java targets (e.g., Java 17) and will break compilation there. Prefer 
`Locale.forLanguageTag(code)` for ISO3 resolution to keep compatibility with 
common LTS baselines; `new Locale(code)` also works, but that constructor is 
deprecated as of Java 19.



##########
opennlp-core/opennlp-runtime/src/test/java/opennlp/tools/tokenize/uax29/WordBreakPropertyTest.java:
##########
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.tokenize.uax29;
+
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertSame;
+
+public class WordBreakPropertyTest {
+
+  @Test
+  void testAsciiLettersAndDigits() {
+    assertSame(WordBreak.ALETTER, WordBreakProperty.of('a'));
+    assertSame(WordBreak.ALETTER, WordBreakProperty.of('Z'));
+    assertSame(WordBreak.NUMERIC, WordBreakProperty.of('0'));
+    assertSame(WordBreak.NUMERIC, WordBreakProperty.of('9'));
+  }
+
+  @Test
+  void testWhitespaceAndLineBreaks() {
+    assertSame(WordBreak.WSEG_SPACE, WordBreakProperty.of(0x0020)); // space
+    assertSame(WordBreak.CR, WordBreakProperty.of(0x000D));
+    assertSame(WordBreak.LF, WordBreakProperty.of(0x000A));
+    assertSame(WordBreak.NEWLINE, WordBreakProperty.of(0x000B));   // vertical 
tab
+  }
+
+  @Test
+  void testMidAndExtendClasses() {
+    assertSame(WordBreak.MID_NUM, WordBreakProperty.of(0x002C));      // comma
+    assertSame(WordBreak.MID_NUM_LET, WordBreakProperty.of(0x002E));  // full 
stop
+    assertSame(WordBreak.MID_LETTER, WordBreakProperty.of(0x003A));   // colon
+    assertSame(WordBreak.EXTEND_NUM_LET, WordBreakProperty.of(0x005F)); // low 
line
+    assertSame(WordBreak.EXTEND, WordBreakProperty.of(0x0301));       // 
combining acute
+  }
+
+  @Test
+  void testQuotesJoinerAndScriptLetters() {
+    assertSame(WordBreak.SINGLE_QUOTE, WordBreakProperty.of(0x0027));
+    assertSame(WordBreak.DOUBLE_QUOTE, WordBreakProperty.of(0x0022));
+    assertSame(WordBreak.ZWJ, WordBreakProperty.of(0x200D));
+    assertSame(WordBreak.HEBREW_LETTER, WordBreakProperty.of(0x05D0));
+    assertSame(WordBreak.KATAKANA, WordBreakProperty.of(0x30A1));
+  }
+
+  @Test
+  void testSupplementaryCodePointsUseTheRangeTable() {
+    assertSame(WordBreak.REGIONAL_INDICATOR, WordBreakProperty.of(0x1F1E6)); 
// regional indicator A
+    assertSame(WordBreak.ALETTER, WordBreakProperty.of(0x1D400));            
// math bold A
+    assertSame(WordBreak.OTHER, WordBreakProperty.of(0x1F600));              
// grinning face
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {0x0021, 0x0040, 0x2014})
+  void testUnassignedCodePointsAreOther(int codePoint) {
+    assertSame(WordBreak.OTHER, WordBreakProperty.of(codePoint));
+  }

Review Comment:
   The test name says “unassigned code points”, but the provided values (e.g., 
U+0021, U+0040, U+2014) are assigned punctuation/symbol characters. Renaming 
the test to reflect what it’s actually asserting (e.g., “punctuation/symbols 
map to OTHER”) would avoid confusion.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] OPENNLP-1850: UAX #29 word tokenizer and the layered Term model (2/4) (opennlp)

Reply via email to