mawiesne commented on code in PR #989: URL: https://github.com/apache/opennlp/pull/989#discussion_r2972114373
########## opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java: ########## @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +/** + * Validates language codes against ISO 639 standards. 
+ * <p> + * Accepts: + * <ul> + * <li>ISO 639-1 two-letter language codes (e.g., {@code "en"}, {@code "de"})</li> + * <li>ISO 639-2/3 three-letter language codes consisting of lowercase ASCII letters + * (e.g., {@code "eng"}, {@code "deu"}, {@code "dut"}, {@code "und"})</li> + * <li>The special code {@code "x-unspecified"} used internally by OpenNLP</li> + * </ul> + * + * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a> + */ +public final class LanguageCodeValidator { + + private static final String X_UNSPECIFIED = "x-unspecified"; + + private static final Set<String> ISO639_1_CODES = Review Comment: We could also use: https://docs.oracle.com/en/java/javase/21/docs/api/java.base/java/util/Locale.html#availableLocales() _Locale#static Stream<Locale> availableLocales()_ and then collect each valid 3-letter code via _loc.getISO3Language()_ This way, the check below would only allow actually existing codes + extra special codes. ########## opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java: ########## @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.tools.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +/** + * Validates language codes against ISO 639 standards. + * <p> + * Accepts: + * <ul> + * <li>ISO 639-1 two-letter language codes (e.g., {@code "en"}, {@code "de"})</li> + * <li>ISO 639-2/3 three-letter language codes consisting of lowercase ASCII letters + * (e.g., {@code "eng"}, {@code "deu"}, {@code "dut"}, {@code "und"})</li> + * <li>The special code {@code "x-unspecified"} used internally by OpenNLP</li> + * </ul> + * + * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a> + */ +public final class LanguageCodeValidator { + + private static final String X_UNSPECIFIED = "x-unspecified"; + + private static final Set<String> ISO639_1_CODES = + new HashSet<>(Arrays.asList(Locale.getISOLanguages())); + + private LanguageCodeValidator() { + // utility class, not intended to be instantiated + } + + /** + * Checks whether the given language code is a valid ISO 639 code. + * <p> + * Two-letter codes are validated against the known set of ISO 639-1 codes. + * Three-letter codes are accepted if they consist entirely of lowercase ASCII letters, + * covering ISO 639-2 (both bibliographic and terminological) and ISO 639-3 codes. + * + * @param languageCode The language code to check. Must not be {@code null}. + * @return {@code true} if the code is valid, {@code false} otherwise. + * @throws NullPointerException if {@code languageCode} is {@code null}. + */ + public static boolean isValid(String languageCode) { + if (languageCode == null) { + throw new NullPointerException("languageCode must not be null"); Review Comment: This should be IllegalArgumentException, adjust Javadoc as well ########## opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java: ########## @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +/** + * Validates language codes against ISO 639 standards. + * <p> + * Accepts: + * <ul> + * <li>ISO 639-1 two-letter language codes (e.g., {@code "en"}, {@code "de"})</li> + * <li>ISO 639-2/3 three-letter language codes consisting of lowercase ASCII letters + * (e.g., {@code "eng"}, {@code "deu"}, {@code "dut"}, {@code "und"})</li> + * <li>The special code {@code "x-unspecified"} used internally by OpenNLP</li> + * </ul> + * + * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a> + */ +public final class LanguageCodeValidator { + + private static final String X_UNSPECIFIED = "x-unspecified"; + + private static final Set<String> ISO639_1_CODES = + new HashSet<>(Arrays.asList(Locale.getISOLanguages())); + + private LanguageCodeValidator() { + // utility class, not intended to be instantiated + } + + /** + * Checks whether the given language code is a valid ISO 639 code. + * <p> + * Two-letter codes are validated against the known set of ISO 639-1 codes. + * Three-letter codes are accepted if they consist entirely of lowercase ASCII letters, + * covering ISO 639-2 (both bibliographic and terminological) and ISO 639-3 codes. 
+ * + * @param languageCode The language code to check. Must not be {@code null}. + * @return {@code true} if the code is valid, {@code false} otherwise. + * @throws NullPointerException if {@code languageCode} is {@code null}. + */ + public static boolean isValid(String languageCode) { + if (languageCode == null) { + throw new NullPointerException("languageCode must not be null"); + } + + if (X_UNSPECIFIED.equals(languageCode)) { + return true; + } + + int len = languageCode.length(); + if (len == 2) { + return ISO639_1_CODES.contains(languageCode); + } + if (len == 3) { + return isLowerAsciiAlpha(languageCode); Review Comment: Can be replaced with extra Set<String> lookup. ########## opennlp-api/src/main/java/opennlp/tools/util/LanguageCodeValidator.java: ########## @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.tools.util; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.Locale; +import java.util.Set; + +/** + * Validates language codes against ISO 639 standards. 
+ * <p> + * Accepts: + * <ul> + * <li>ISO 639-1 two-letter language codes (e.g., {@code "en"}, {@code "de"})</li> + * <li>ISO 639-2/3 three-letter language codes consisting of lowercase ASCII letters + * (e.g., {@code "eng"}, {@code "deu"}, {@code "dut"}, {@code "und"})</li> + * <li>The special code {@code "x-unspecified"} used internally by OpenNLP</li> + * </ul> + * + * @see <a href="https://iso639-3.sil.org/">ISO 639-3</a> + */ +public final class LanguageCodeValidator { + + private static final String X_UNSPECIFIED = "x-unspecified"; + + private static final Set<String> ISO639_1_CODES = + new HashSet<>(Arrays.asList(Locale.getISOLanguages())); + + private LanguageCodeValidator() { + // utility class, not intended to be instantiated + } + + /** + * Checks whether the given language code is a valid ISO 639 code. + * <p> + * Two-letter codes are validated against the known set of ISO 639-1 codes. + * Three-letter codes are accepted if they consist entirely of lowercase ASCII letters, + * covering ISO 639-2 (both bibliographic and terminological) and ISO 639-3 codes. + * + * @param languageCode The language code to check. Must not be {@code null}. + * @return {@code true} if the code is valid, {@code false} otherwise. + * @throws NullPointerException if {@code languageCode} is {@code null}. + */ + public static boolean isValid(String languageCode) { + if (languageCode == null) { + throw new NullPointerException("languageCode must not be null"); + } + + if (X_UNSPECIFIED.equals(languageCode)) { + return true; + } + + int len = languageCode.length(); + if (len == 2) { + return ISO639_1_CODES.contains(languageCode); + } + if (len == 3) { + return isLowerAsciiAlpha(languageCode); + } + return false; + } + + /** + * Validates the given language code and throws an {@link IllegalArgumentException} + * if it is not a recognized ISO 639 language code. + * + * @param languageCode The language code to validate. Must not be {@code null}. 
+ * @throws IllegalArgumentException if the code is not a valid ISO 639 language code. + * @throws NullPointerException if {@code languageCode} is {@code null}. + */ + public static void validateLanguageCode(String languageCode) { + if (!isValid(languageCode)) { + throw new IllegalArgumentException( + "Unknown language code '" + languageCode + "', must be a valid ISO 639 code!"); + } + } + + private static boolean isLowerAsciiAlpha(String s) { Review Comment: This implementation accepts any three-letter string of lowercase ASCII letters, so fake ISO codes that do not actually exist pass validation. Not good. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
