This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch OPENNLP-1633-Remove-dependency-towards-jackson-databind-in-opennlp-dl-module in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit e700f4d7212d25a61f0afa57783a70c7701bbca2 Author: Richard Zowalla <[email protected]> AuthorDate: Tue Oct 29 12:53:02 2024 +0100 OPENNLP-1633 - Remove dependency towards jackson-databind in opennlp-dl module --- opennlp-brat-annotator/pom.xml | 7 -- opennlp-dl/pom.xml | 12 -- .../dl/doccat/DocumentCategorizerConfig.java | 33 +++++- .../opennlp/dl/doccat/DocumentCategorizerDL.java | 42 +++---- .../dl/doccat/DocumentCategorizerConfigTest.java | 125 +++++++++++++++++++++ 5 files changed, 169 insertions(+), 50 deletions(-) diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml index 6a4b7b52..58426af5 100644 --- a/opennlp-brat-annotator/pom.xml +++ b/opennlp-brat-annotator/pom.xml @@ -61,13 +61,6 @@ <artifactId>jackson-databind</artifactId> <version>${jackson.version}</version> <scope>runtime</scope> - <exclusions> - <!-- Byte-Buddy became a dependency by accident - TODO remove it with update version > 2.17.0 --> - <exclusion> - <groupId>net.bytebuddy</groupId> - <artifactId>byte-buddy</artifactId> - </exclusion> - </exclusions> </dependency> <dependency> diff --git a/opennlp-dl/pom.xml b/opennlp-dl/pom.xml index ab52d402..22e80a88 100644 --- a/opennlp-dl/pom.xml +++ b/opennlp-dl/pom.xml @@ -41,18 +41,6 @@ <artifactId>onnxruntime</artifactId> <version>${onnxruntime.version}</version> </dependency> - <dependency> - <groupId>com.fasterxml.jackson.core</groupId> - <artifactId>jackson-databind</artifactId> - <version>${jackson.version}</version> - <exclusions> - <!-- Byte-Buddy became a dependency by accident - TODO remove it with update version > 2.17.0 --> - <exclusion> - <groupId>net.bytebuddy</groupId> - <artifactId>byte-buddy</artifactId> - </exclusion> - </exclusions> - </dependency> <dependency> <groupId>org.slf4j</groupId> <artifactId>slf4j-api</artifactId> diff --git a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java index 8e6d04e2..218266e9 100644 --- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java +++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerConfig.java @@ -18,18 +18,39 @@ package opennlp.dl.doccat; import java.util.Collections; +import java.util.HashMap; import java.util.Map; +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; -public class DocumentCategorizerConfig { +public record DocumentCategorizerConfig(Map<String, String> id2label) { - private Map<String, String> id2label; + private static final Pattern ID_TO_LABEL_PATTERN = Pattern.compile("\"id2label\"\\s*:\\s*\\{(.*?)\\}", Pattern.DOTALL); + private static final Pattern ENTRY_PATTERN = Pattern.compile("\"(\\d+)\"\\s*:\\s*\"(.*?)\""); - public Map<String, String> getId2label() { + @Override + public Map<String, String> id2label() { return Collections.unmodifiableMap(id2label); } - public void setId2label(Map<String, String> id2label) { - this.id2label = id2label; - } + public static DocumentCategorizerConfig fromJson(String json) { + Objects.requireNonNull(json, "json must not be null"); + + final Map<String, String> id2label = new HashMap<>(); + final Matcher matcher = ID_TO_LABEL_PATTERN.matcher(json); + + if (matcher.find()) { + final String id2labelContent = matcher.group(1); + final Matcher entryMatcher = ENTRY_PATTERN.matcher(id2labelContent); + while (entryMatcher.find()) { + final String key = entryMatcher.group(1); + final String value = entryMatcher.group(2); + id2label.put(key, value); + } + } + + return new DocumentCategorizerConfig(id2label); + } } diff --git a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java index a06d4b09..822af6af 100644 --- a/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java +++ b/opennlp-dl/src/main/java/opennlp/dl/doccat/DocumentCategorizerDL.java @@ -30,14 +30,13 @@ import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; + import java.util.stream.IntStream; import ai.onnxruntime.OnnxTensor; import ai.onnxruntime.OrtEnvironment; import ai.onnxruntime.OrtException; import ai.onnxruntime.OrtSession; -import com.fasterxml.jackson.databind.DeserializationFeature; -import com.fasterxml.jackson.databind.ObjectMapper; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -67,16 +66,15 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor /** * Instantiates a {@link DocumentCategorizer document categorizer} using ONNX models. * - * @param model The ONNX model file. - * @param vocabulary The model file's vocabulary file. - * @param categories The categories. + * @param model The ONNX model file. + * @param vocabulary The model file's vocabulary file. + * @param categories The categories. * @param classificationScoringStrategy Implementation of {@link ClassificationScoringStrategy} used * to calculate the classification scores given the score of each * individual document part. - * @param inferenceOptions {@link InferenceOptions} to control the inference. - * + * @param inferenceOptions {@link InferenceOptions} to control the inference. * @throws OrtException Thrown if the {@code model} cannot be loaded. - * @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}. + * @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}. */ public DocumentCategorizerDL(File model, File vocabulary, Map<Integer, String> categories, ClassificationScoringStrategy classificationScoringStrategy, @@ -102,21 +100,20 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor /** * Instantiates a {@link DocumentCategorizer document categorizer} using ONNX models. * - * @param model The ONNX model file. - * @param vocabulary The model file's vocabulary file. - * @param config The model's config file. The file will be used to determine the classification categories. + * @param model The ONNX model file. + * @param vocabulary The model file's vocabulary file. + * @param config The model's config file. The file will be used to determine the classification categories. * @param classificationScoringStrategy Implementation of {@link ClassificationScoringStrategy} used * to calculate the classification scores given the score of each * individual document part. - * @param inferenceOptions {@link InferenceOptions} to control the inference. - * + * @param inferenceOptions {@link InferenceOptions} to control the inference. * @throws OrtException Thrown if the {@code model} cannot be loaded. - * @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}. + * @throws IOException Thrown if errors occurred loading the {@code model} or {@code vocabulary}. */ public DocumentCategorizerDL(File model, File vocabulary, File config, ClassificationScoringStrategy classificationScoringStrategy, InferenceOptions inferenceOptions) - throws IOException, OrtException { + throws IOException, OrtException { this.env = OrtEnvironment.getEnvironment(); @@ -175,7 +172,7 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor logger.error("Unload to perform document classification inference", ex); } - return new double[]{}; + return new double[] {}; } @@ -315,6 +312,7 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor /** * Applies softmax to an array of values. + * * @param input An array of values. * @return The output array. */ @@ -346,18 +344,12 @@ public class DocumentCategorizerDL extends AbstractDL implements DocumentCategor } private Map<Integer, String> readCategoriesFromFile(File config) throws IOException { - - final String json = new String(Files.readAllBytes(config.toPath())); - - final ObjectMapper objectMapper = new ObjectMapper(); - objectMapper.configure(DeserializationFeature.FAIL_ON_UNKNOWN_PROPERTIES, false); - final DocumentCategorizerConfig documentCategorizerConfig = - objectMapper.readValue(json, DocumentCategorizerConfig.class); + DocumentCategorizerConfig.fromJson(new String(Files.readAllBytes(config.toPath()))); final Map<Integer, String> categories = new HashMap<>(); - for (final String key : documentCategorizerConfig.getId2label().keySet()) { - categories.put(Integer.valueOf(key), documentCategorizerConfig.getId2label().get(key)); + for (final String key : documentCategorizerConfig.id2label().keySet()) { + categories.put(Integer.valueOf(key), documentCategorizerConfig.id2label().get(key)); } return categories; diff --git a/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java new file mode 100644 index 00000000..a7ff5339 --- /dev/null +++ b/opennlp-dl/src/test/java/opennlp/dl/doccat/DocumentCategorizerConfigTest.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.dl.doccat; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + + +public class DocumentCategorizerConfigTest { + + @Test + public void testId2LabelsFromJsonPrettyValid() { + final String json = """ + { + "_num_labels": 5, + "architectures": [ + "BertForSequenceClassification" + ], + "attention_probs_dropout_prob": 0.1, + "directionality": "bidi", + "finetuning_task": "sentiment-analysis", + "hidden_act": "gelu", + "hidden_dropout_prob": 0.1, + "hidden_size": 768, + "id2label": { + "0": "1 star", + "1": "2 stars", + "2": "3 stars", + "3": "4 stars", + "4": "5 stars" + }, + "initializer_range": 0.02, + "intermediate_size": 3072, + "label2id": { + "1 star": 0, + "2 stars": 1, + "3 stars": 2, + "4 stars": 3, + "5 stars": 4 + }, + "layer_norm_eps": 1e-12, + "max_position_embeddings": 512, + "model_type": "bert", + "num_attention_heads": 12, + "num_hidden_layers": 12, + "output_past": true, + "pad_token_id": 0, + "pooler_fc_size": 768, + "pooler_num_attention_heads": 12, + "pooler_num_fc_layers": 3, + "pooler_size_per_head": 128, + "pooler_type": "first_token_transform", + "type_vocab_size": 2, + "vocab_size": 105879 + } + """; + + final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json); + assertNotNull(config); + assertEquals(5, config.id2label().size()); + assertEquals("1 star", config.id2label().get("0")); + assertEquals("2 stars", config.id2label().get("1")); + assertEquals("3 stars", config.id2label().get("2")); + assertEquals("4 stars", config.id2label().get("3")); + assertEquals("5 stars", config.id2label().get("4")); + } + + @Test + public void testId2LabelsFromJsonUglyValid() { + final String json = """ + {"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis", + "hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"id2label":{"0":"1 star","1":"2 stars","2":"3 stars","3":"4 stars","4":"5 stars"},"initializer_range":0.02, + "intermediate_size":3072,"label2id":{"1 star":0,"2 stars":1,"3 stars":2,"4 stars":3,"5 stars":4},"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert", + "num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3, + "pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,"vocab_size":105879} + """; + + final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json); + assertNotNull(config); + assertEquals(5, config.id2label().size()); + assertEquals("1 star", config.id2label().get("0")); + assertEquals("2 stars", config.id2label().get("1")); + assertEquals("3 stars", config.id2label().get("2")); + assertEquals("4 stars", config.id2label().get("3")); + assertEquals("5 stars", config.id2label().get("4")); + } + + @Test + public void testId2LabelsFromJsonNoValues() { + final String json = """ + {"_num_labels":5,"architectures":["BertForSequenceClassification"],"attention_probs_dropout_prob":0.1,"directionality":"bidi","finetuning_task":"sentiment-analysis", + "hidden_act":"gelu","hidden_dropout_prob":0.1,"hidden_size":768,"layer_norm_eps":1e-12,"max_position_embeddings":512,"model_type":"bert", + "num_attention_heads":12,"num_hidden_layers":12,"output_past":true,"pad_token_id":0,"pooler_fc_size":768,"pooler_num_attention_heads":12,"pooler_num_fc_layers":3, + "pooler_size_per_head":128,"pooler_type":"first_token_transform","type_vocab_size":2,"vocab_size":105879} + """; + + final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json); + assertNotNull(config); + assertEquals(0, config.id2label().size()); + } + + @Test + public void testId2LabelsFromJsonEmptyInput() { + final String json = ""; + final DocumentCategorizerConfig config = DocumentCategorizerConfig.fromJson(json); + assertNotNull(config); + assertEquals(0, config.id2label().size()); + } +}
