This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch update-sandbox-components-to-use-opennlp-tools-version-2.4 in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 8ff740d6a3b9f933ef3d3cc4a02e0b2596491b19 Author: Martin Wiesner <[email protected]> AuthorDate: Thu Jul 4 09:23:59 2024 +0200 Update sandbox components to use opennlp-tools version 2.3.4 / 2.4.0 - adds logger to WSDEvaluator to avoid plain System.out logging - clears some compiler warnings --- .../java/opennlp/addons/mallet/CRFTrainer.java | 4 +- .../opennlp/addons/mallet/TransducerModel.java | 8 ++-- .../addons/mallet/TransducerModelSerializer.java | 6 +-- .../apps/solr/IterativeSearchRequestHandler.java | 8 +--- opennlp-wsd/pom.xml | 7 +++- .../opennlp/tools/disambiguator/WSDEvaluator.java | 8 +++- pom.xml | 6 +-- .../wikinews_importer/AnnotatingMarkupParser.java | 45 ++++++++-------------- 8 files changed, 41 insertions(+), 51 deletions(-) diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java index 9145be1..11cd4b7 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/CRFTrainer.java @@ -58,7 +58,7 @@ public class CRFTrainer extends AbstractTrainer implements SequenceTrainer { } @Override - public <T> SequenceClassificationModel<String> train(SequenceStream<T> sequences) + public <T> SequenceClassificationModel train(SequenceStream<T> sequences) throws IOException { Alphabet dataAlphabet = new Alphabet(); @@ -146,7 +146,7 @@ public class CRFTrainer extends AbstractTrainer implements SequenceTrainer { // can be very similar to the other model // one important difference is that the feature gen needs to be integrated // ... - return new TransducerModel<>(crf); + return new TransducerModel(crf); } // TODO: We need to return a sequence model here. How should that be done ?! diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java index 0c1fe67..47bb341 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModel.java @@ -34,7 +34,7 @@ import cc.mallet.types.FeatureVector; import cc.mallet.types.FeatureVectorSequence; import cc.mallet.types.Sequence; -public class TransducerModel<T> implements SequenceClassificationModel<T>, SerializableArtifact { +public class TransducerModel implements SequenceClassificationModel, SerializableArtifact { private final Transducer model; @@ -47,14 +47,14 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria } @Override - public opennlp.tools.util.Sequence bestSequence(T[] sequence, + public <T> opennlp.tools.util.Sequence bestSequence(T[] sequence, Object[] additionalContext, BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) { return bestSequences(1, sequence, additionalContext, cg, validator)[0]; } @Override - public opennlp.tools.util.Sequence[] bestSequences(int numSequences, + public <T> opennlp.tools.util.Sequence[] bestSequences(int numSequences, T[] sequence, Object[] additionalContext, double minSequenceScore, BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) { // TODO: How to implement min score filtering here? @@ -62,7 +62,7 @@ public class TransducerModel<T> implements SequenceClassificationModel<T>, Seria } @Override - public opennlp.tools.util.Sequence[] bestSequences(int numSequences, + public <T> opennlp.tools.util.Sequence[] bestSequences(int numSequences, T[] sequence, Object[] additionalContext, BeamSearchContextGenerator<T> cg, SequenceValidator<T> validator) { diff --git a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java index 9513618..9d24cc1 100644 --- a/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java +++ b/mallet-addon/src/main/java/opennlp/addons/mallet/TransducerModelSerializer.java @@ -28,13 +28,13 @@ import java.io.OutputStream; import opennlp.tools.util.model.ArtifactSerializer; import cc.mallet.fst.Transducer; -public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel<?>> { +public class TransducerModelSerializer implements ArtifactSerializer<TransducerModel> { @Override - public TransducerModel<?> create(InputStream in) throws IOException { + public TransducerModel create(InputStream in) throws IOException { try (ObjectInputStream ois = new ObjectInputStream(in)) { Transducer classifier = (Transducer) ois.readObject(); - return new TransducerModel<>(classifier); + return new TransducerModel(classifier); } catch (ClassNotFoundException e) { throw new IOException(e); } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java index 51f838f..1cba60a 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java @@ -30,7 +30,6 @@ import org.apache.commons.lang.StringUtils; import org.apache.lucene.document.Document; import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.queryparser.classic.ParseException; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.Query; @@ -123,7 +122,6 @@ public class IterativeSearchRequestHandler extends SearchHandler { rsp.setAllValues(rsp3.getValues()); } - @SuppressWarnings("unchecked") public DocList filterResultsBySyntMatchReduceDocSet(DocList docList, SolrQueryRequest req, SolrParams params) { //if (!docList.hasScores()) @@ -257,8 +255,7 @@ public class IterativeSearchRequestHandler extends SearchHandler { rsp.add("response", results); } - private Query buildFilter(String[] fqs, SolrQueryRequest req) - throws IOException, ParseException { + private Query buildFilter(String[] fqs, SolrQueryRequest req) { if (fqs != null && fqs.length > 0) { BooleanQuery.Builder fquery = new BooleanQuery.Builder(); for (String fq : fqs) { @@ -323,13 +320,12 @@ public class IterativeSearchRequestHandler extends SearchHandler { alreadyFound.add(hit.doc); } } - public static class PairComparable implements Comparator<Pair> { + public static class PairComparable implements Comparator<Pair<Integer, Float>> { @Override public int compare(Pair o1, Pair o2) { int b = -2; if ( o1.getSecond() instanceof Float && o2.getSecond() instanceof Float){ - b = (((Float) o2.getSecond()).compareTo((Float) o1.getSecond())); } return b; diff --git a/opennlp-wsd/pom.xml b/opennlp-wsd/pom.xml index aac1196..51421c5 100644 --- a/opennlp-wsd/pom.xml +++ b/opennlp-wsd/pom.xml @@ -59,16 +59,19 @@ <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-api</artifactId> </dependency> - <dependency> <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-engine</artifactId> </dependency> - <dependency> <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-params</artifactId> </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-simple</artifactId> + <version>${slf4j.version}</version> + </dependency> </dependencies> <build> diff --git a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java index eeab5c1..17dcce9 100644 --- a/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java +++ b/opennlp-wsd/src/main/java/opennlp/tools/disambiguator/WSDEvaluator.java @@ -20,6 +20,9 @@ package opennlp.tools.disambiguator; import opennlp.tools.util.eval.Evaluator; import opennlp.tools.util.eval.Mean; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + /** * The {@link WSDEvaluator} measures the performance of the given * {@link WSDisambiguator} with the provided reference {@code WordToDisambiguate}. @@ -29,6 +32,8 @@ import opennlp.tools.util.eval.Mean; */ public class WSDEvaluator extends Evaluator<WSDSample> { + private static final Logger LOG = LoggerFactory.getLogger(WSDEvaluator.class); + private final Mean accuracy = new Mean(); /** @@ -61,8 +66,7 @@ public class WSDEvaluator extends Evaluator<WSDSample> { reference.getTargetPosition()); if (predictedSense == null) { - System.out - .println("There was no sense for : " + reference.getTargetWord()); + LOG.debug("There was no sense for: {}", reference.getTargetWord()); return null; } // get the senseKey from the result diff --git a/pom.xml b/pom.xml index d3f284c..0234a9e 100644 --- a/pom.xml +++ b/pom.xml @@ -117,11 +117,11 @@ <maven.compiler.target>${java.version}</maven.compiler.target> <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding> - <opennlp.tools.version>2.3.3</opennlp.tools.version> + <opennlp.tools.version>2.3.4-SNAPSHOT</opennlp.tools.version> <opennlp.forkCount>1.0C</opennlp.forkCount> - <slf4j.version>1.7.36</slf4j.version> - <log4j2.version>2.20.0</log4j2.version> + <slf4j.version>2.0.13</slf4j.version> + <log4j2.version>2.23.1</log4j2.version> <uimaj.version>3.4.1</uimaj.version> <jersey-client.version>2.41</jersey-client.version> diff --git a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java index 90f1721..2624ae7 100644 --- a/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java +++ b/wikinews-importer/src/main/java/org/apache/opennlp/wikinews_importer/AnnotatingMarkupParser.java @@ -22,7 +22,6 @@ import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Set; -import java.util.regex.Pattern; import info.bliki.htmlcleaner.ContentToken; import info.bliki.htmlcleaner.TagNode; @@ -48,35 +47,26 @@ import info.bliki.wiki.tags.WPATag; */ public class AnnotatingMarkupParser implements ITextConverter { - public static final String HREF_ATTR_KEY = "href"; + private static final String HREF_ATTR_KEY = "href"; - public static final String WIKILINK_TITLE_ATTR_KEY = "title"; + private static final String WIKILINK_TITLE_ATTR_KEY = "title"; + private static final String WIKILINK_TARGET_ATTR_KEY = "href"; + private static final String WIKIOBJECT_ATTR_KEY = "wikiobject"; - public static final String WIKILINK_TARGET_ATTR_KEY = "href"; + private static final Set<String> PARAGRAPH_TAGS = Set.of("p"); + private static final Set<String> HEADING_TAGS = Set.of("h1", "h2", "h3", "h4", "h5", "h6"); - public static final String WIKIOBJECT_ATTR_KEY = "wikiobject"; + private final List<Annotation> wikilinks = new ArrayList<>(); + private final List<Annotation> headers = new ArrayList<>(); + private final List<Annotation> paragraphs = new ArrayList<>(); - public static final Set<String> PARAGRAPH_TAGS = Set.of("p"); + private String languageCode = "en"; - public static final Set<String> HEADING_TAGS = Set.of("h1", "h2", "h3", "h4", "h5", "h6"); + private final WikiModel model; - public static final Pattern INTERWIKI_PATTERN = Pattern.compile("http://[\\w-]+\\.wikipedia\\.org/wiki/.*"); + private String redirect; - protected final List<Annotation> wikilinks = new ArrayList<>(); - - protected final List<Annotation> headers = new ArrayList<>(); - - protected final List<Annotation> paragraphs = new ArrayList<>(); - - protected String languageCode = "en"; - - protected final WikiModel model; - - protected String redirect; - - protected String text; - - protected static final Pattern REDIRECT_PATTERN = Pattern.compile("^#REDIRECT \\[\\[([^\\]]*)\\]\\]"); + private String text; public AnnotatingMarkupParser() { model = makeWikiModel(languageCode); @@ -119,9 +109,8 @@ public class AnnotatingMarkupParser implements ITextConverter { return; } for (Object node : nodes) { - if (node instanceof WPATag) { + if (node instanceof WPATag tag) { // extract wikilink annotations - WPATag tag = (WPATag) node; String wikilinkLabel = tag.getAttributes().get(WIKILINK_TITLE_ATTR_KEY); String wikilinkTarget = tag.getAttributes().get(WIKILINK_TARGET_ATTR_KEY); if (wikilinkLabel != null) { @@ -142,8 +131,7 @@ public class AnnotatingMarkupParser implements ITextConverter { tag.getBodyString(countingBuffer); } - } else if (node instanceof ContentToken) { - ContentToken contentToken = (ContentToken) node; + } else if (node instanceof ContentToken contentToken) { countingBuffer.append(contentToken.getContent()); } else if (node instanceof List) { } else if (node instanceof WPList) { @@ -152,8 +140,7 @@ public class AnnotatingMarkupParser implements ITextConverter { // do not hold grammatically correct // interesting sentences that are representative of the // language. - } else if (node instanceof TagNode) { - TagNode tagNode = (TagNode) node; + } else if (node instanceof TagNode tagNode) { Map<String, String> attributes = tagNode.getAttributes(); Map<String, Object> oAttributes = tagNode.getObjectAttributes(); boolean hasSpecialHandling = false;
