krickert commented on code in PR #1103: URL: https://github.com/apache/opennlp/pull/1103#discussion_r3446967067
########## opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java: ########## @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.Arrays; + +/** + * A mapping between character offsets in a normalized string and the original text it came from. + * + * <p>Normalization that collapses runs or substitutes supplementary characters changes string + * length, so an offset into the normalized form no longer lines up with the original. This map + * records, for every normalized character, the original character offset it was produced from, + * which lets a match found in the normalized form be reported in original coordinates.</p> + * + * <p>The internal mapping is non-decreasing, so {@link #toOriginalOffset(int)} is a direct array + * read (O(1)) and {@link #toNormalizedOffset(int)} is a binary search (O(log n)). The map is + * built in the same single cursor pass that produces the normalized text, via {@link Builder}.</p> + */ +public final class OffsetMap { + + // normalizedToOriginal[k] is the original char offset that produced normalized char k. + // It has one extra trailing slot mapping the end of the normalized text to the end of the + // original text, so offsets in [0, normalizedLength] are all valid. + private final int[] normalizedToOriginal; + private final int originalLength; + + private OffsetMap(int[] normalizedToOriginal, int originalLength) { + this.normalizedToOriginal = normalizedToOriginal; + this.originalLength = originalLength; + } + + /** + * Maps a normalized character offset back to the original text. + * + * @param normalizedOffset An offset in {@code [0, normalizedLength]}. + * @return The corresponding original character offset. + * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range. + */ + public int toOriginalOffset(int normalizedOffset) { + if (normalizedOffset < 0 || normalizedOffset >= normalizedToOriginal.length) { + throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset + + " is outside [0, " + normalizedLength() + "]"); + } + return normalizedToOriginal[normalizedOffset]; + } + + /** + * Maps an original character offset forward to the normalized text. + * + * <p>Returns the first normalized offset whose source is at or after {@code originalOffset}. + * When several original characters collapse to one normalized character, they all map to that + * single normalized offset.</p> + * + * @param originalOffset An offset in {@code [0, originalLength]}. + * @return The corresponding normalized character offset. + * @throws IndexOutOfBoundsException Thrown if {@code originalOffset} is out of range. + */ + public int toNormalizedOffset(int originalOffset) { + if (originalOffset < 0 || originalOffset > originalLength) { + throw new IndexOutOfBoundsException("original offset " + originalOffset + + " is outside [0, " + originalLength + "]"); + } + int low = 0; + int high = normalizedToOriginal.length - 1; + int answer = normalizedToOriginal.length - 1; + while (low <= high) { + final int mid = (low + high) >>> 1; + if (normalizedToOriginal[mid] >= originalOffset) { + answer = mid; + high = mid - 1; + } else { + low = mid + 1; + } + } + return answer; + } Review Comment: Fixed. `toNormalizedOffset` now uses floor semantics: the rightmost normalized index whose source offset is at or before the queried offset, clamped to 0. An original offset that falls inside a collapsed run now maps to that run's single normalized character instead of the following one. Added `OffsetMapTest` and extended `CharClassTest` with collapsed-run cases for mixed Unicode whitespace, tabs, newlines, single-char and all-whitespace inputs, and surrogate pairs. ########## opennlp-api/src/main/java/opennlp/tools/util/normalizer/OffsetMap.java: ########## @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.util.Arrays; + +/** + * A mapping between character offsets in a normalized string and the original text it came from. + * + * <p>Normalization that collapses runs or substitutes supplementary characters changes string + * length, so an offset into the normalized form no longer lines up with the original. This map + * records, for every normalized character, the original character offset it was produced from, + * which lets a match found in the normalized form be reported in original coordinates.</p> + * + * <p>The internal mapping is non-decreasing, so {@link #toOriginalOffset(int)} is a direct array + * read (O(1)) and {@link #toNormalizedOffset(int)} is a binary search (O(log n)). The map is + * built in the same single cursor pass that produces the normalized text, via {@link Builder}.</p> + */ +public final class OffsetMap { + + // normalizedToOriginal[k] is the original char offset that produced normalized char k. + // It has one extra trailing slot mapping the end of the normalized text to the end of the + // original text, so offsets in [0, normalizedLength] are all valid. + private final int[] normalizedToOriginal; + private final int originalLength; + + private OffsetMap(int[] normalizedToOriginal, int originalLength) { + this.normalizedToOriginal = normalizedToOriginal; + this.originalLength = originalLength; + } + + /** + * Maps a normalized character offset back to the original text. + * + * @param normalizedOffset An offset in {@code [0, normalizedLength]}. + * @return The corresponding original character offset. + * @throws IndexOutOfBoundsException Thrown if {@code normalizedOffset} is out of range. + */ + public int toOriginalOffset(int normalizedOffset) { + if (normalizedOffset < 0 || normalizedOffset >= normalizedToOriginal.length) { + throw new IndexOutOfBoundsException("normalized offset " + normalizedOffset + + " is outside [0, " + normalizedLength() + "]"); + } + return normalizedToOriginal[normalizedOffset]; + } + + /** + * Maps an original character offset forward to the normalized text. + * + * <p>Returns the first normalized offset whose source is at or after {@code originalOffset}. + * When several original characters collapse to one normalized character, they all map to that + * single normalized offset.</p> Review Comment: Fixed. The javadoc now describes the floor behavior: it maps to the character the offset falls within, and offsets before the first retained character map to 0. ########## rat-excludes: ########## @@ -64,3 +64,6 @@ src/test/resources/*.info <!-- other licence --> src/main/java/opennlp/tools/stemmer/snowball/*.java src/main/resources/opennlp/tools/stopword/*.txt + +<!-- OPENNLP-1850 bundled Unicode data file (Unicode License V3, no Apache header possible) --> +src/main/resources/opennlp/tools/util/normalizer/confusables.txt Review Comment: Correct as written. `rat-excludes` entries are module-relative (RAT runs per Maven module), so they drop the module prefix. This matches the existing convention in this file: `src/main/resources/opennlp/tools/stopword/*.txt` resolves against `opennlp-core/opennlp-runtime/`, and `src/test/resources/opennlp/spellcheck/*.txt` (added in OPENNLP-1832) resolves against `opennlp-extensions/opennlp-spellcheck/`. Adding an `opennlp-core/opennlp-runtime/` prefix would make it the only inconsistent entry and would not match RAT's per-module resolution. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Confusables.java: ########## @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.HashMap; +import java.util.Map; + +/** + * Computes the Unicode confusable <em>skeleton</em> of text, following the algorithm in + * <a href="https://www.unicode.org/reports/tr39/">UTS #39</a> (Unicode Security Mechanisms). Two + * strings are confusable, for example Latin {@code "paypal"} and a version using Cyrillic + * lookalikes, exactly when their skeletons are equal. + * + * <p>The mapping is loaded once from the {@code confusables.txt} resource of the Unicode security + * data (parsed with simple cursor scanning, no regular expression). The skeleton of a string is + * {@code NFD(map(NFD(s)))}: decompose, replace each code point with its prototype, and decompose + * again. This changes length and offsets, so it belongs to the derived, matching-only form rather + * than to any offset-preserving transform.</p> + */ +public final class Confusables { + + private static final String RESOURCE = "confusables.txt"; + + // Maps a single confusable code point to its prototype sequence (one or more code points). + private static final Map<Integer, String> PROTOTYPES = load(); + + private Confusables() { + } + + private static Map<Integer, String> load() { + final Map<Integer, String> map = new HashMap<>(12000); + try (InputStream in = Confusables.class.getResourceAsStream(RESOURCE)) { + if (in == null) { + throw new IllegalStateException("Missing confusables data resource: " + RESOURCE); + } + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + final int hash = line.indexOf('#'); + final String content = (hash < 0 ? line : line.substring(0, hash)).strip(); + if (content.isEmpty()) { + continue; + } + final int firstSemicolon = content.indexOf(';'); + final int secondSemicolon = content.indexOf(';', firstSemicolon + 1); + if (firstSemicolon < 0 || secondSemicolon < 0) { + continue; + } + final int source = Integer.parseInt(content.substring(0, firstSemicolon).strip(), 16); + final String target = content.substring(firstSemicolon + 1, secondSemicolon).strip(); + final StringBuilder prototype = new StringBuilder(); + for (final String codePoint : target.split("\\s+")) { + prototype.appendCodePoint(Integer.parseInt(codePoint, 16)); + } + map.put(source, prototype.toString()); Review Comment: Fixed. Replaced the regex split with a manual cursor scan over the whitespace-delimited hex tokens, which matches the class's no-regex contract and avoids compiling a Pattern for every line during static init. The parser is exercised end to end by the new `ConfusableSkeletonTest`. ########## LICENSE: ########## @@ -370,3 +370,47 @@ The following license applies to the SLF4J API: LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +The following license applies to the bundled Unicode data file in +opennlp-core/opennlp-runtime/src/main/resources/opennlp/tools/util/normalizer +(confusables.txt): + + UNICODE LICENSE V3 + + COPYRIGHT AND PERMISSION NOTICE + + Copyright (c) 1991-2026 Unicode, Inc. Review Comment: Fixed. Set the embedded Unicode License V3 block to 1991-2025 to match the bundled `confusables.txt` header and the NOTICE attribution. ########## opennlp-core/opennlp-runtime/src/main/java/opennlp/tools/util/normalizer/Confusables.java: ########## @@ -0,0 +1,120 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.util.normalizer; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.text.Normalizer; +import java.util.HashMap; +import java.util.Map; + +/** + * Computes the Unicode confusable <em>skeleton</em> of text, following the algorithm in + * <a href="https://www.unicode.org/reports/tr39/">UTS #39</a> (Unicode Security Mechanisms). Two + * strings are confusable, for example Latin {@code "paypal"} and a version using Cyrillic + * lookalikes, exactly when their skeletons are equal. + * + * <p>The mapping is loaded once from the {@code confusables.txt} resource of the Unicode security + * data (parsed with simple cursor scanning, no regular expression). The skeleton of a string is + * {@code NFD(map(NFD(s)))}: decompose, replace each code point with its prototype, and decompose + * again. This changes length and offsets, so it belongs to the derived, matching-only form rather + * than to any offset-preserving transform.</p> + */ +public final class Confusables { + + private static final String RESOURCE = "confusables.txt"; + + // Maps a single confusable code point to its prototype sequence (one or more code points). + private static final Map<Integer, String> PROTOTYPES = load(); + + private Confusables() { + } + + private static Map<Integer, String> load() { + final Map<Integer, String> map = new HashMap<>(12000); + try (InputStream in = Confusables.class.getResourceAsStream(RESOURCE)) { + if (in == null) { + throw new IllegalStateException("Missing confusables data resource: " + RESOURCE); + } + try (BufferedReader reader = + new BufferedReader(new InputStreamReader(in, StandardCharsets.UTF_8))) { + String line; + while ((line = reader.readLine()) != null) { + final int hash = line.indexOf('#'); + final String content = (hash < 0 ? line : line.substring(0, hash)).strip(); + if (content.isEmpty()) { + continue; + } + final int firstSemicolon = content.indexOf(';'); + final int secondSemicolon = content.indexOf(';', firstSemicolon + 1); + if (firstSemicolon < 0 || secondSemicolon < 0) { + continue; + } + final int source = Integer.parseInt(content.substring(0, firstSemicolon).strip(), 16); + final String target = content.substring(firstSemicolon + 1, secondSemicolon).strip(); + final StringBuilder prototype = new StringBuilder(); + for (final String codePoint : target.split("\\s+")) { + prototype.appendCodePoint(Integer.parseInt(codePoint, 16)); + } + map.put(source, prototype.toString()); + } + } + } catch (IOException e) { + throw new UncheckedIOException("Unable to read confusables data resource " + RESOURCE, e); + } + return map; + } + + /** + * Returns the confusable skeleton of {@code text}: {@code NFD(map(NFD(text)))} where {@code map} + * replaces each code point with its UTS #39 prototype. The skeleton is for comparison only; + * it is not human-readable text and does not preserve offsets. + * + * @param text The text to reduce. + * @return The skeleton. + */ + public static String skeleton(CharSequence text) { + final String decomposed = Normalizer.normalize(text, Normalizer.Form.NFD); + final StringBuilder mapped = new StringBuilder(decomposed.length()); + for (int i = 0; i < decomposed.length(); ) { + final int codePoint = decomposed.codePointAt(i); Review Comment: Fixed. Added `ConfusableSkeletonTest` covering skeleton generation directly: known table mappings (Cyrillic a U+0430 to Latin a, Greek omicron U+03BF to Latin o), a spoofed-word confusable match, the reflexive case, distinct non-confusable text, and an ASCII identity case. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
