This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch migrate-mallet-addon-to-opennlp-tools-2_1_0
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit f369e6c8b582d89e03baf564b95d4537cc7bf76e Author: Martin Wiesner <[email protected]> AuthorDate: Fri Jan 20 11:24:53 2023 +0100 updates sandbox component 'mallet-addon' to be compatible with latest opennlp-tools release - adjusts opennlp-tools to 2.1.0 - adjusts parent project (org.apache.apache) to version 18 - adjusts Java language level to 11 - updates to mallet version 2.0.8 to mitigate several CVEs, adds exclusions and related newer versions to mitigate CVEs - adjusts some array declarations to comply with Java, not C, style - improves resource handling of streams - removes unused imports --- mallet-addon/pom.xml | 44 +++++++++++++++++++--- .../java/opennlp/addons/mallet/CRFTrainer.java | 15 +++----- .../opennlp/addons/mallet/ClassifierModel.java | 27 +++++++------ .../addons/mallet/ClassifierModelSerializer.java | 15 +++----- .../java/opennlp/addons/mallet/MaxentTrainer.java | 18 ++------- .../opennlp/addons/mallet/TransducerModel.java | 20 +++++----- .../addons/mallet/TransducerModelSerializer.java | 9 ++--- 7 files changed, 82 insertions(+), 66 deletions(-) diff --git a/mallet-addon/pom.xml b/mallet-addon/pom.xml index c5f2ca9..d1e134f 100644 --- a/mallet-addon/pom.xml +++ b/mallet-addon/pom.xml @@ -21,10 +21,17 @@ <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> <modelVersion>4.0.0</modelVersion> - + <parent> + <groupId>org.apache</groupId> + <artifactId>apache</artifactId> + <!-- TODO OPENNLP-1452 once this is resolved, move to 29 as well. 
--> + <version>18</version> + <relativePath /> + </parent> + <groupId>kottmann.opennlp</groupId> <artifactId>mallet-addon</artifactId> - <version>1.6.0-SNAPSHOT</version> + <version>2.1.1-SNAPSHOT</version> <packaging>jar</packaging> <name>Apache OpenNLP Mallet Addon</name> @@ -33,13 +40,37 @@ <dependency> <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> - <version>1.6.1-SNAPSHOT</version> + <version>2.1.0</version> </dependency> <dependency> <groupId>cc.mallet</groupId> <artifactId>mallet</artifactId> - <version>2.0.7</version> + <version>2.0.8</version> + <exclusions> + <exclusion> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + </exclusion> + <exclusion> + <groupId>org.jdom</groupId> + <artifactId>jdom</artifactId> + </exclusion> + <exclusion> + <groupId>org.beanshell</groupId> + <artifactId>bsh</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.jdom</groupId> + <artifactId>jdom</artifactId> + <version>1.1.3</version> + </dependency> + <dependency> + <groupId>org.apache-extras.beanshell</groupId> + <artifactId>bsh</artifactId> + <version>2.0b6</version> </dependency> </dependencies> @@ -67,8 +98,9 @@ <groupId>org.apache.maven.plugins</groupId> <artifactId>maven-compiler-plugin</artifactId> <configuration> - <source>1.7</source> - <target>1.7</target> + <source>11</source> + <target>11</target> + <compilerArgument>-Xlint</compilerArgument> </configuration> </plugin> <plugin> diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java index 7e6de66..0700e2b 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java @@ -20,7 +20,6 @@ package opennlp.addons.mallet; import java.io.IOException; -import java.util.Map; import java.util.regex.Pattern; import opennlp.tools.ml.AbstractSequenceTrainer; @@ -30,7 +29,6 @@ 
import opennlp.tools.ml.model.SequenceClassificationModel; import opennlp.tools.ml.model.SequenceStream; import cc.mallet.fst.CRF; import cc.mallet.fst.CRFOptimizableByLabelLikelihood; -import cc.mallet.fst.CRFTrainerByLabelLikelihood; import cc.mallet.fst.CRFTrainerByValueGradients; import cc.mallet.fst.Transducer; import cc.mallet.optimize.Optimizable; @@ -71,17 +69,17 @@ public class CRFTrainer extends AbstractSequenceTrainer { int nameIndex = 0; Sequence sequence; while ((sequence = sequences.read()) != null) { - FeatureVector featureVectors[] = new FeatureVector[sequence.getEvents().length]; - Label malletOutcomes[] = new Label[sequence.getEvents().length]; + FeatureVector[] featureVectors = new FeatureVector[sequence.getEvents().length]; + Label[] malletOutcomes = new Label[sequence.getEvents().length]; - Event events[] = sequence.getEvents(); + Event[] events = sequence.getEvents(); for (int eventIndex = 0; eventIndex < events.length; eventIndex++) { Event event = events[eventIndex]; - String features[] = event.getContext(); - int malletFeatures[] = new int[features.length]; + String[] features = event.getContext(); + int[] malletFeatures = new int[features.length]; for (int featureIndex = 0; featureIndex < features.length; featureIndex++) { malletFeatures[featureIndex] = dataAlphabet.lookupIndex( @@ -109,8 +107,7 @@ public class CRFTrainer extends AbstractSequenceTrainer { CRF crf = new CRF(trainingData.getDataAlphabet(), trainingData.getTargetAlphabet()); - String startStateName = crf.addOrderNStates(trainingData, getOrders(), - (boolean[]) null, + String startStateName = crf.addOrderNStates(trainingData, getOrders(), null, // default label "other", Pattern.compile("other,*-cont"), // forbidden pattern null, // allowed pattern diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java index 5f6661d..1426be9 100644 --- 
a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModel.java @@ -35,18 +35,19 @@ import cc.mallet.types.LabelVector; class ClassifierModel implements MaxentModel, SerializableArtifact { - private Classifier classifer; + private final Classifier classifier; public ClassifierModel(Classifier classifer) { - this.classifer = classifer; + this.classifier = classifer; } - Classifier getClassifer() { - return classifer; + Classifier getClassifier() { + return classifier; } - + + @Override public double[] eval(String[] features) { - Alphabet dataAlphabet = classifer.getAlphabet(); + Alphabet dataAlphabet = classifier.getAlphabet(); List<Integer> malletFeatureList = new ArrayList<>(features.length); @@ -62,15 +63,15 @@ class ClassifierModel implements MaxentModel, SerializableArtifact { malletFeatures[i] = malletFeatureList.get(i); } - FeatureVector fv = new FeatureVector(classifer.getAlphabet(), + FeatureVector fv = new FeatureVector(classifier.getAlphabet(), malletFeatures); Instance instance = new Instance(fv, null, null, null); - Classification result = classifer.classify(instance); + Classification result = classifier.classify(instance); LabelVector labeling = result.getLabelVector(); - LabelAlphabet targetAlphabet = classifer.getLabelAlphabet(); + LabelAlphabet targetAlphabet = classifier.getLabelAlphabet(); double outcomes[] = new double[targetAlphabet.size()]; for (int i = 0; i < outcomes.length; i++) { @@ -84,10 +85,12 @@ class ClassifierModel implements MaxentModel, SerializableArtifact { return outcomes; } + @Override public double[] eval(String[] context, double[] probs) { return eval(context); } + @Override public double[] eval(String[] context, float[] values) { return eval(context); } @@ -109,17 +112,17 @@ class ClassifierModel implements MaxentModel, SerializableArtifact { @Override public String getOutcome(int i) { - return 
classifer.getLabelAlphabet().lookupLabel(i).getEntry().toString(); + return classifier.getLabelAlphabet().lookupLabel(i).getEntry().toString(); } @Override public int getIndex(String outcome) { - return classifer.getLabelAlphabet().lookupIndex(outcome); + return classifier.getLabelAlphabet().lookupIndex(outcome); } @Override public int getNumOutcomes() { - return classifer.getLabelAlphabet().size(); + return classifier.getLabelAlphabet().size(); } @Override diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java index 9cfb6f2..f3b4806 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/ClassifierModelSerializer.java @@ -26,20 +26,16 @@ import java.io.ObjectOutputStream; import java.io.OutputStream; import cc.mallet.classify.Classifier; -import opennlp.tools.util.InvalidFormatException; import opennlp.tools.util.model.ArtifactSerializer; // The standard method for saving classifiers in Mallet is through Java serialization. 
-public class ClassifierModelSerializer implements - ArtifactSerializer<ClassifierModel> { +public class ClassifierModelSerializer implements ArtifactSerializer<ClassifierModel> { @Override - public ClassifierModel create(InputStream in) throws IOException, - InvalidFormatException { + public ClassifierModel create(InputStream in) throws IOException { - ObjectInputStream ois = new ObjectInputStream(in); - try { + try ( ObjectInputStream ois = new ObjectInputStream(in)) { Classifier classifier = (Classifier) ois.readObject(); return new ClassifierModel(classifier); } catch (ClassNotFoundException e) { @@ -48,10 +44,9 @@ public class ClassifierModelSerializer implements } @Override - public void serialize(ClassifierModel artifact, OutputStream out) - throws IOException { + public void serialize(ClassifierModel artifact, OutputStream out) throws IOException { ObjectOutputStream oos = new ObjectOutputStream(out); - oos.writeObject(artifact.getClassifer()); + oos.writeObject(artifact.getClassifier()); oos.flush(); } } diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java index e9524a9..cfcb294 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/MaxentTrainer.java @@ -22,22 +22,12 @@ package opennlp.addons.mallet; import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.Map; import opennlp.tools.ml.AbstractEventTrainer; import opennlp.tools.ml.model.DataIndexer; import opennlp.tools.ml.model.MaxentModel; -import cc.mallet.classify.C45Trainer; import cc.mallet.classify.Classifier; -import cc.mallet.classify.MaxEntGETrainer; -import cc.mallet.classify.MaxEntL1Trainer; -import cc.mallet.classify.MaxEntPRTrainer; import cc.mallet.classify.MaxEntTrainer; -import cc.mallet.classify.NaiveBayes; -import cc.mallet.classify.NaiveBayesEMTrainer; -import 
cc.mallet.classify.NaiveBayesTrainer; -import cc.mallet.optimize.LimitedMemoryBFGS; -import cc.mallet.optimize.Optimizer; import cc.mallet.types.Alphabet; import cc.mallet.types.FeatureVector; import cc.mallet.types.Instance; @@ -61,13 +51,13 @@ public class MaxentTrainer extends AbstractEventTrainer { Collection<Instance> instances = new ArrayList<>(); - String predLabels[] = indexer.getPredLabels(); + String[] predLabels = indexer.getPredLabels(); - int outcomes[] = indexer.getOutcomeList(); + int[] outcomes = indexer.getOutcomeList(); for (int contextIndex = 0; contextIndex < indexer.getContexts().length; contextIndex++) { - int malletFeatures[] = new int[indexer.getContexts()[contextIndex].length]; - double weights[] = new double[indexer.getContexts()[contextIndex].length]; + int[] malletFeatures = new int[indexer.getContexts()[contextIndex].length]; + double[] weights = new double[indexer.getContexts()[contextIndex].length]; for (int featureIndex = 0; featureIndex < malletFeatures.length; featureIndex++) { malletFeatures[featureIndex] = dataAlphabet.lookupIndex( diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java index e713d83..91afec3 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java @@ -36,7 +36,7 @@ import cc.mallet.types.Sequence; public class TransducerModel<T> implements SequenceClassificationModel<T>, SerializableArtifact { - private Transducer model; + private final Transducer model; public TransducerModel(Transducer model) { this.model = model; @@ -45,7 +45,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria Transducer getModel() { return model; } - + + @Override public opennlp.tools.util.Sequence bestSequence(T[] sequence, Object[] additionalContext, BeamSearchContextGenerator<T> cg, SequenceValidator<T> 
validator) { @@ -59,7 +60,8 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria // TODO: How to implement min score filtering here? return bestSequences(numSequences, sequence, additionalContext, cg, validator); } - + + @Override public opennlp.tools.util.Sequence[] bestSequences(int numSequences, T[] sequence, Object[] additionalContext, BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) { @@ -67,16 +69,16 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria // TODO: CRF.getInputAlphabet Alphabet dataAlphabet = model.getInputPipe().getAlphabet(); - FeatureVector featureVectors[] = new FeatureVector[sequence.length]; + FeatureVector[] featureVectors = new FeatureVector[sequence.length]; // TODO:: The feature generator needs to get the detected sequence in the end // to update the adaptive data! - String prior[] = new String[sequence.length]; + String[] prior = new String[sequence.length]; Arrays.fill(prior, "s"); // <- HACK, this will degrade performance! // TODO: Put together a feature generator which doesn't fail if outcomes is null! 
for (int i = 0; i < sequence.length; i++) { - String features[] = cg.getContext(i, sequence, null, additionalContext); + String[] features = cg.getContext(i, sequence, null, additionalContext); List<Integer> malletFeatureList = new ArrayList<>(features.length); @@ -86,7 +88,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria } } - int malletFeatures[] = new int[malletFeatureList.size()]; + int[] malletFeatures = new int[malletFeatureList.size()]; for (int k = 0; k < malletFeatureList.size(); k++) { malletFeatures[k] = malletFeatureList.get(k); } @@ -97,7 +99,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria FeatureVectorSequence malletSequence = new FeatureVectorSequence(featureVectors); - Sequence[] answers = null; + Sequence[] answers; if (numSequences == 1) { answers = new Sequence[1]; answers[0] = model.transduce(malletSequence); @@ -136,7 +138,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria Alphabet targetAlphabet = model.getInputPipe().getTargetAlphabet(); - String outcomes[] = new String[targetAlphabet.size()]; + String[] outcomes = new String[targetAlphabet.size()]; for (int i = 0; i < targetAlphabet.size(); i++) { outcomes[i] = targetAlphabet.lookupObject(i).toString(); diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java index b793ca2..6e05eab 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java @@ -32,10 +32,8 @@ import cc.mallet.fst.Transducer; public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel> { @Override - public TransducerModel create(InputStream in) throws IOException, - InvalidFormatException { - ObjectInputStream ois = new ObjectInputStream(in); - try { + public 
TransducerModel create(InputStream in) throws IOException, InvalidFormatException { + try (ObjectInputStream ois = new ObjectInputStream(in)) { Transducer classifier = (Transducer) ois.readObject(); return new TransducerModel(classifier); } catch (ClassNotFoundException e) { @@ -44,8 +42,7 @@ public class TransducerModelSerializer implements ArtifactSerializer<TransducerM } @Override - public void serialize(TransducerModel artifact, OutputStream out) - throws IOException { + public void serialize(TransducerModel artifact, OutputStream out) throws IOException { ObjectOutputStream oos = new ObjectOutputStream(out); oos.writeObject(artifact.getModel()); oos.flush();
