This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch experimental/cleanup-dependency-mess-of-opennlp-similarity in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit beff0dca60f4db9ab5615a00ff3710bf65fb21e0 Author: Martin Wiesner <[email protected]> AuthorDate: Tue Dec 10 11:26:20 2024 +0100 reorganizes dependencies of 'opennlp-similarity' component switches 'tika-app' to more lightweight 'tika-core' dep switches 'docx4j' to more lightweight / modern 'docx4j-core' dep (11.5.1, jakarta) switches to ud-models in opennlp-similarity component uses thread-safe Tokenizer, POSTagger and SentenceDetector impl classes to avoid race conditions, as shown by JUnit tests sometimes --- opennlp-similarity/pom.xml | 84 +++++---- .../review_builder/FBOpenGraphSearchManager.java | 148 --------------- .../review_builder/WebPageReviewExtractor.java | 2 - .../tools/apps/utils/email/EmailSender.java | 26 +-- .../tools/apps/utils/email/SMTPAuthenticator.java | 4 +- ...cClassifierTrainingSetMultilingualExtender.java | 6 +- .../DocClassifierTrainingSetVerifier.java | 4 +- .../enron_email_recognizer/EmailNormalizer.java | 13 +- .../EmailTrainingSetFormer.java | 9 +- .../main/java/opennlp/tools/nl2code/NL2Obj.java | 13 +- .../similarity/apps/ContentGeneratorRunner.java | 21 +-- .../tools/similarity/apps/solr/CommentsRel.java | 2 +- .../apps/solr/ContentGeneratorRequestHandler.java | 43 +---- .../solr/SearchResultsReRankerRequestHandler.java | 26 +-- .../apps/solr/WordDocBuilderEndNotes.java | 45 ++--- .../ParserChunker2MatcherProcessor.java | 201 ++++++++------------- .../ParserPure2MatcherProcessor.java | 60 +++--- .../src/test/resources/models/en-sent.bin | Bin 98533 -> 0 bytes pom.xml | 18 +- 19 files changed, 238 insertions(+), 487 deletions(-) diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml index 58dd8a2..b10aa48 100644 --- a/opennlp-similarity/pom.xml +++ b/opennlp-similarity/pom.xml @@ -27,6 +27,12 @@ <name>Apache OpenNLP Similarity distribution</name> <properties> + <jakarta.bind-api.version>4.0.2</jakarta.bind-api.version> + <jakarta.mail.version>2.1.3</jakarta.mail.version> + + <tika.version>3.0.0</tika.version> + 
<solr.version>8.11.3</solr.version> + <docx4j.version>11.5.1</docx4j.version> <dl4j.version>1.0.0-M2.1</dl4j.version> <hdf5.version>1.14.3-1.5.10</hdf5.version> <javacpp.version>1.5.11</javacpp.version> @@ -83,27 +89,24 @@ <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-tools</artifactId> </dependency> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> </dependency> - <dependency> - <groupId>commons-lang</groupId> - <artifactId>commons-lang</artifactId> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <scope>runtime</scope> </dependency> <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> + <groupId>jakarta.xml.bind</groupId> + <artifactId>jakarta.xml.bind-api</artifactId> + <version>${jakarta.bind-api.version}</version> </dependency> <dependency> - <groupId>commons-collections</groupId> - <artifactId>commons-collections</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-math3</artifactId> + <groupId>jakarta.mail</groupId> + <artifactId>jakarta.mail-api</artifactId> + <version>${jakarta.mail.version}</version> </dependency> <dependency> <groupId>org.json</groupId> @@ -112,19 +115,20 @@ </dependency> <dependency> <groupId>org.apache.tika</groupId> - <artifactId>tika-app</artifactId> - <version>3.0.0</version> + <artifactId>tika-core</artifactId> + <version>${tika.version}</version> </dependency> <dependency> - <groupId>net.sf.opencsv</groupId> - <artifactId>opencsv</artifactId> - <version>2.3</version> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parser-html-module</artifactId> + <version>${tika.version}</version> + <scope>runtime</scope> </dependency> <dependency> <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> - <version>8.11.3</version> + <version>${solr.version}</version> <exclusions> <exclusion> 
<groupId>org.apache.hadoop</groupId> @@ -138,20 +142,13 @@ <groupId>org.eclipse.jetty.http2</groupId> <artifactId>*</artifactId> </exclusion> + <exclusion> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>*</artifactId> + </exclusion> </exclusions> </dependency> - <dependency> - <groupId>javax.mail</groupId> - <artifactId>mail</artifactId> - <version>1.4.7</version> - </dependency> - <dependency> - <groupId>com.restfb</groupId> - <artifactId>restfb</artifactId> - <version>1.49.0</version> - </dependency> - <dependency> <groupId>net.billylieurance.azuresearch</groupId> <artifactId>azure-bing-search-java</artifactId> @@ -181,8 +178,8 @@ <dependency> <groupId>org.docx4j</groupId> - <artifactId>docx4j</artifactId> - <version>6.1.2</version> + <artifactId>docx4j-core</artifactId> + <version>${docx4j.version}</version> <exclusions> <!-- Exclusion here as log4j version 2 bindings are used during tests/runtime--> <exclusion> @@ -217,11 +214,7 @@ </exclusion> </exclusions> </dependency> - <dependency> - <groupId>org.deeplearning4j</groupId> - <artifactId>deeplearning4j-ui</artifactId> - <version>${dl4j.version}</version> - </dependency> + <dependency> <groupId>org.deeplearning4j</groupId> <artifactId>deeplearning4j-nlp</artifactId> @@ -252,10 +245,15 @@ <groupId>org.junit.jupiter</groupId> <artifactId>junit-jupiter-params</artifactId> </dependency> + + <!-- Logging --> <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-api</artifactId> - <scope>test</scope> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> </dependency> <dependency> <groupId>org.apache.logging.log4j</groupId> @@ -265,7 +263,7 @@ <dependency> <groupId>org.apache.logging.log4j</groupId> <artifactId>log4j-slf4j2-impl</artifactId> - <scope>test</scope> + <scope>runtime</scope> </dependency> </dependencies> @@ -444,7 +442,7 @@ <configuration> 
<source>${maven.compiler.source}</source> <target>${maven.compiler.target}</target> - <compilerArgument>-Xlint</compilerArgument> + <compilerArgument>-Xlint:-options</compilerArgument> </configuration> </plugin> diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/FBOpenGraphSearchManager.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/FBOpenGraphSearchManager.java deleted file mode 100644 index f2a130a..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/FBOpenGraphSearchManager.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.tools.apps.review_builder; - -import java.util.ArrayList; -import java.util.List; - -import com.restfb.Connection; -import com.restfb.DefaultFacebookClient; -import com.restfb.FacebookClient; -import com.restfb.Parameter; -import com.restfb.exception.FacebookException; -import com.restfb.types.Event; -import com.restfb.types.Page; -import org.apache.commons.lang.StringUtils; - -import opennlp.tools.jsmlearning.ProfileReaderWriter; -import opennlp.tools.similarity.apps.utils.PageFetcher; - -public class FBOpenGraphSearchManager { - - public final List<String[]> profiles; - protected FacebookClient mFBClient; - protected final PageFetcher pageFetcher = new PageFetcher(); - protected static final int NUM_TRIES = 5; - protected static final long WAIT_BTW_TRIES=1000; //milliseconds between re-tries - - public FBOpenGraphSearchManager(){ - profiles = ProfileReaderWriter.readProfiles("C:\\nc\\features\\analytics\\dealanalyzer\\sweetjack-localcoupon-may12012tooct302012.csv"); - } - - public void setFacebookClient(FacebookClient c){ - this.mFBClient=c; - } - - public List<Event> getFBEventsByName(String event) - { - List<Event> events = new ArrayList<>(); - - for(int i=0; i < NUM_TRIES; i++) - { - try - { - Connection<Event> publicSearch = - mFBClient.fetchConnection("search", Event.class, - Parameter.with("q", event), Parameter.with("type", "event"),Parameter.with("limit", 100)); - System.out.println("Searching FB events for " + event); - events= publicSearch.getData(); - break; - } - catch(FacebookException e) - { - System.out.println("FBError "+e); - try - { - Thread.sleep(WAIT_BTW_TRIES); - } - catch (InterruptedException e1) - { - System.out.println("Error "+e1); - } - } - } - return events; - } - - public Long getFBPageLikes(String merchant) - { - List<Page> groups = new ArrayList<>(); - - for(int i=0; i < NUM_TRIES; i++) - { - try - { - Connection<Page> publicSearch = - mFBClient.fetchConnection("search", Page.class, - Parameter.with("q", 
merchant), Parameter.with("type", "page"),Parameter.with("limit", 100)); - System.out.println("Searching FB Pages for " + merchant); - groups= publicSearch.getData(); - break; - } - catch(FacebookException e) - { - System.out.println("FBError "+e); - try - { - Thread.sleep(WAIT_BTW_TRIES); - } - catch (InterruptedException e1) - { - System.out.println("Error "+e1); - } - } - } - - for (Page p: groups){ - if (p!=null && p.getLikes()!=null && p.getLikes()>0) - return p.getLikes(); - } - - //stats fwb">235</span> - - for (Page p: groups){ - if (p.getId()==null) - continue; - String content = pageFetcher.fetchOrigHTML("http://www.facebook.com/"+p.getId()); - - String likes = StringUtils.substringBetween(content, "stats fwb\">", "<" ); - if (likes==null) - continue; - int nLikes =0; - try { - nLikes = Integer.parseInt(likes); - } catch (Exception e){ - - } - if (nLikes>0){ - return (long)nLikes; - } - - } - return null; - } - - public static void main(String[] args){ - FBOpenGraphSearchManager man = new FBOpenGraphSearchManager (); - man.setFacebookClient(new DefaultFacebookClient()); - - long res = man.getFBPageLikes("chain saw"); - System.out.println(res); - - } -} diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java index 4448f58..14574f3 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/WebPageReviewExtractor.java @@ -28,7 +28,6 @@ import opennlp.tools.similarity.apps.HitBase; import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; import opennlp.tools.similarity.apps.utils.Utils; import opennlp.tools.textsimilarity.TextProcessor; -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; import org.apache.commons.lang.StringUtils; import 
org.slf4j.Logger; @@ -392,7 +391,6 @@ public class WebPageReviewExtractor extends WebPageExtractor { public static void main(String[] args){ String resourceDir = "C:/stanford-corenlp/src/test/resources/"; - ParserChunker2MatcherProcessor proc = ParserChunker2MatcherProcessor.getInstance(resourceDir); //ProductFinderInAWebPage init = new ProductFinderInAWebPage("C:/workspace/relevanceEngine/src/test/resources"); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java index c5388fa..94ba811 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java @@ -17,19 +17,19 @@ package opennlp.tools.apps.utils.email; -import javax.activation.DataHandler; -import javax.activation.DataSource; -import javax.activation.FileDataSource; -import javax.mail.Authenticator; -import javax.mail.BodyPart; -import javax.mail.Message; -import javax.mail.Multipart; -import javax.mail.Session; -import javax.mail.Transport; -import javax.mail.internet.InternetAddress; -import javax.mail.internet.MimeBodyPart; -import javax.mail.internet.MimeMessage; -import javax.mail.internet.MimeMultipart; +import jakarta.activation.DataHandler; +import jakarta.activation.DataSource; +import jakarta.activation.FileDataSource; +import jakarta.mail.Authenticator; +import jakarta.mail.BodyPart; +import jakarta.mail.Message; +import jakarta.mail.Multipart; +import jakarta.mail.Session; +import jakarta.mail.Transport; +import jakarta.mail.internet.InternetAddress; +import jakarta.mail.internet.MimeBodyPart; +import jakarta.mail.internet.MimeMessage; +import jakarta.mail.internet.MimeMultipart; import java.util.Properties; import java.util.regex.Matcher; import java.util.regex.Pattern; diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java index c48ab34..55f56dd 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/SMTPAuthenticator.java @@ -17,12 +17,12 @@ package opennlp.tools.apps.utils.email; -import javax.mail.PasswordAuthentication; +import jakarta.mail.PasswordAuthentication; /** * This contains the required information for the smtp authorization! */ -public class SMTPAuthenticator extends javax.mail.Authenticator { +public class SMTPAuthenticator extends jakarta.mail.Authenticator { private final String username; private final String password; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java index 29a5107..18d778c 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java @@ -27,11 +27,11 @@ import java.net.URL; import java.nio.channels.Channels; import java.nio.channels.ReadableByteChannel; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.util.ArrayList; import java.util.HashSet; import java.util.List; -import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; /* @@ -86,7 +86,7 @@ public class DocClassifierTrainingSetMultilingualExtender { List<String> filteredEntries = new ArrayList<>(); String content=null; try { - content = FileUtils.readFileToString(new File(filename), StandardCharsets.UTF_8); + content = Files.readString(new File(filename).toPath(), 
StandardCharsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } @@ -127,7 +127,7 @@ public class DocClassifierTrainingSetMultilingualExtender { continue; System.out.println("processing "+f.getName()); - content = FileUtils.readFileToString(f, "utf-8"); + content = Files.readString(f.toPath(), StandardCharsets.UTF_8); int langIndex =0; for(String[] begEnd: MULTILINGUAL_TOKENS){ String urlDirty = StringUtils.substringBetween(content, begEnd[0], begEnd[1]); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java index 95c2b27..d774c4d 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java @@ -18,12 +18,12 @@ package opennlp.tools.doc_classifier; import java.io.File; import java.io.IOException; +import java.nio.file.Files; import java.util.ArrayList; import java.util.List; import opennlp.tools.jsmlearning.ProfileReaderWriter; -import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; @@ -96,7 +96,7 @@ public class DocClassifierTrainingSetVerifier { && resultsClassif.get(0).equals( ClassifierTrainingSetIndexer.getCategoryFromFilePath(f.getAbsolutePath()))){ String destFileName = f.getAbsolutePath().replace(sourceDir, destinationDir); - FileUtils.copyFile(f, new File(destFileName)); + Files.copy(f.toPath(), new File(destFileName).toPath()); bRejected = false; } else { System.out.println("File "+ f.getAbsolutePath() + "\n classified as "+ diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java index 6e1ebe9..3fde124 100644 --- 
a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailNormalizer.java @@ -20,10 +20,9 @@ package opennlp.tools.enron_email_recognizer; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.util.ArrayList; -import org.apache.commons.io.FileUtils; - public class EmailNormalizer { protected final ArrayList<File> queue = new ArrayList<>(); @@ -67,7 +66,7 @@ public class EmailNormalizer { public void normalizeAndWriteIntoANewFile(File f){ String content = ""; try { - content = FileUtils.readFileToString(f, StandardCharsets.UTF_8); + content = Files.readString(f.toPath(), StandardCharsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } @@ -95,10 +94,10 @@ public class EmailNormalizer { String directoryNew = f.getAbsolutePath().replace(origFolder, newFolder); try { String fullFileNameNew = directoryNew +"txt"; - FileUtils.writeStringToFile(new File(fullFileNameNew), buf.toString(), StandardCharsets.UTF_8); - } catch (IOException e) { - e.printStackTrace(); - } + Files.writeString(new File(fullFileNameNew).toPath(), buf.toString(), StandardCharsets.UTF_8); + } catch (IOException e) { + e.printStackTrace(); + } } public void normalizeDirectory(File f){ diff --git a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java index 1a8ce6d..2551052 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/enron_email_recognizer/EmailTrainingSetFormer.java @@ -20,10 +20,9 @@ package opennlp.tools.enron_email_recognizer; import java.io.File; import java.io.IOException; import java.nio.charset.StandardCharsets; +import 
java.nio.file.Files; import java.util.List; -import org.apache.commons.io.FileUtils; - public class EmailTrainingSetFormer { static final String DATA_DIR = "/Users/bgalitsky/Downloads/"; static final String FILE_LIST_FILE = "cats4_11-17.txt"; @@ -32,14 +31,14 @@ public class EmailTrainingSetFormer { //enron_with_categories/5/70665.cats:4,10,1 public static void createPosTrainingSet(){ try { - List<String> lines = FileUtils.readLines(new File(DATA_DIR + FILE_LIST_FILE), StandardCharsets.UTF_8); + List<String> lines = Files.readAllLines(new File(DATA_DIR + FILE_LIST_FILE).toPath(), StandardCharsets.UTF_8); for(String l: lines){ int endOfFname = l.indexOf('.'), startOfFname = l.lastIndexOf('/'); String filenameOld = DATA_DIR + l.substring(0, endOfFname)+".txt"; String content = normalize(new File(filenameOld)); String filenameNew = DESTINATION_DIR + l.substring(startOfFname+1, endOfFname)+".txt"; //FileUtils.copyFile(new File(filenameOld), new File(filenameNew)); - FileUtils.writeStringToFile(new File(filenameNew), content, StandardCharsets.UTF_8); + Files.writeString(new File(filenameNew).toPath(), content, StandardCharsets.UTF_8); } } catch (Exception e) { e.printStackTrace(); @@ -52,7 +51,7 @@ public class EmailTrainingSetFormer { public static String normalize(File f){ String content=""; try { - content = FileUtils.readFileToString(f, StandardCharsets.UTF_8); + content = Files.readString(f.toPath(), StandardCharsets.UTF_8); } catch (IOException e) { e.printStackTrace(); } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java b/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java index e4beac6..3d8929f 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/nl2code/NL2Obj.java @@ -30,18 +30,15 @@ public class NL2Obj { ObjectControlOp prevOp; public NL2Obj(String path) { + this(); + } + + public NL2Obj() { prevOp = new ObjectControlOp(); 
prevOp.setOperatorIf(""); prevOp.setOperatorFor(""); - parser = ParserChunker2MatcherProcessor.getInstance(path); + parser = ParserChunker2MatcherProcessor.getInstance(); } - - public NL2Obj() { - prevOp = new ObjectControlOp(); - prevOp.setOperatorIf(""); - prevOp.setOperatorFor(""); - parser = ParserChunker2MatcherProcessor.getInstance(); - } static final String[] EPISTEMIC_STATES_LIST = new String[] { "select", "verify", "find", "start", "stop", "go", "check" diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java index b6bc2b1..0bf2e59 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorRunner.java @@ -18,26 +18,13 @@ package opennlp.tools.similarity.apps; import java.util.List; -import javax.mail.internet.AddressException; -import javax.mail.internet.InternetAddress; - -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; +import jakarta.mail.internet.AddressException; +import jakarta.mail.internet.InternetAddress; public class ContentGeneratorRunner { + public static void main(String[] args) { - ParserChunker2MatcherProcessor sm = null; - - try { - String resourceDir = args[2]; - if (resourceDir!=null) - sm = ParserChunker2MatcherProcessor.getInstance(resourceDir); - else - sm = ParserChunker2MatcherProcessor.getInstance(); - - } catch (Exception e) { - e.printStackTrace(); - } - + String bingKey = args[7]; if (bingKey == null){ bingKey = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0="; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java index e80e94e..85c4714 100644 --- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/CommentsRel.java @@ -23,7 +23,7 @@ import java.io.File; import java.io.IOException; import java.math.BigInteger; -import javax.xml.bind.JAXBException; +import jakarta.xml.bind.JAXBException; import org.docx4j.XmlUtils; import org.docx4j.jaxb.Context; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java index a40c0bb..bd79132 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java @@ -16,17 +16,14 @@ */ package opennlp.tools.similarity.apps.solr; -import java.io.BufferedReader; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; import java.util.List; import java.util.logging.Logger; -import javax.mail.internet.AddressException; -import javax.mail.internet.InternetAddress; +import jakarta.mail.internet.AddressException; +import jakarta.mail.internet.InternetAddress; import org.apache.solr.common.util.NamedList; import org.apache.solr.handler.component.SearchHandler; @@ -36,7 +33,6 @@ import org.apache.solr.response.SolrQueryResponse; import opennlp.tools.similarity.apps.HitBase; import opennlp.tools.similarity.apps.RelatedSentenceFinder; import opennlp.tools.similarity.apps.RelatedSentenceFinderML; -import opennlp.tools.textsimilarity.chunker2matcher.ParserChunker2MatcherProcessor; public class ContentGeneratorRequestHandler extends SearchHandler { private static final Logger LOG = @@ -97,44 +93,13 @@ public class ContentGeneratorRequestHandler extends SearchHandler { } - static 
class StreamLogger extends Thread{ - - private final InputStream mInputStream; - - public StreamLogger(InputStream is) { - this.mInputStream = is; - } - - public void run() { - try { - InputStreamReader isr = new InputStreamReader(mInputStream); - BufferedReader br = new BufferedReader(isr); - String line; - while ((line = br.readLine()) != null) { - System.out.println(line); - } - } catch (IOException ioe) { - ioe.printStackTrace(); - } - } - } - public String cgRunner(String[] args) { - int count=0; + + int count=0; for(String a: args){ System.out.print(count+">>" + a + " | "); count++; } - try { - String resourceDir = args[2]; - ParserChunker2MatcherProcessor sm = null; - if (resourceDir!=null) - sm = ParserChunker2MatcherProcessor.getInstance(resourceDir); - else - sm = ParserChunker2MatcherProcessor.getInstance(); - } catch (Exception e) { - e.printStackTrace(); - } String bingKey = args[7]; if (bingKey == null){ diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java index 3e77f43..c7345fc 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java @@ -16,11 +16,11 @@ */ package opennlp.tools.similarity.apps.solr; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Comparator; import java.util.Iterator; import java.util.List; -import java.util.logging.Logger; import opennlp.tools.similarity.apps.HitBase; import opennlp.tools.textsimilarity.ParseTreeChunk; @@ -34,16 +34,16 @@ import org.apache.solr.common.util.NamedList; import org.apache.solr.handler.component.SearchHandler; import org.apache.solr.request.SolrQueryRequest; import org.apache.solr.response.SolrQueryResponse; 
+import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class SearchResultsReRankerRequestHandler extends SearchHandler { - private static final Logger LOG = - Logger.getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler"); + + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + private final static int MAX_SEARCH_RESULTS = 100; private final ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); private ParserChunker2MatcherProcessor sm = null; - private static final String RESOURCE_DIR = "/home/solr/solr-4.4.0/example/src/test/resources"; - //"C:/workspace/TestSolr/src/test/resources"; - //"/data1/solr/example/src/test/resources"; public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){ // get query string @@ -66,10 +66,6 @@ public class SearchResultsReRankerRequestHandler extends SearchHandler { List<HitBase> searchResults = new ArrayList<>(); - - - - for (int i = 0; i< MAX_SEARCH_RESULTS; i++){ String title = req.getParams().get("t"+i); String descr = req.getParams().get("d"+i); @@ -106,7 +102,6 @@ public class SearchResultsReRankerRequestHandler extends SearchHandler { } } - List<HitBase> reRankedResults; query = query.replace('+', ' '); if (tooFewKeywords(query)|| orQuery(query)){ @@ -165,12 +160,11 @@ public class SearchResultsReRankerRequestHandler extends SearchHandler { return false; } - private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits, - String searchQuery) { + private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits, String searchQuery) { try { - sm = ParserChunker2MatcherProcessor.getInstance(RESOURCE_DIR); - } catch (Exception e){ - LOG.severe(e.getMessage()); + sm = ParserChunker2MatcherProcessor.getInstance(); + } catch (RuntimeException e){ + LOG.error(e.getMessage(), e); } List<HitBase> newHitList = new ArrayList<>(); diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java index afe37fc..dcda0ce 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/WordDocBuilderEndNotes.java @@ -16,15 +16,11 @@ */ package opennlp.tools.similarity.apps.solr; - import java.io.File; import java.math.BigInteger; import java.util.ArrayList; import java.util.List; -import javax.xml.bind.JAXBException; - -import org.apache.commons.lang.StringUtils; import org.docx4j.XmlUtils; import org.docx4j.jaxb.Context; import org.docx4j.openpackaging.exceptions.InvalidFormatException; @@ -69,7 +65,7 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ String processedParaTitle = processParagraphTitle(para.getTitle()); if (processedParaTitle!=null && - !processedParaTitle.endsWith("..") || StringUtils.isAlphanumeric(processedParaTitle)){ + !processedParaTitle.endsWith("..") || processedParaTitle.chars().allMatch(this::isAlphanumeric)){ wordMLPackage.getMainDocumentPart().addStyledParagraphOfText("Subtitle",processedParaTitle); } String paraText = processParagraphText(para.getFragments().toString()); @@ -85,7 +81,7 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ "<w:rStyle w:val=\"EndnoteReference\"/></w:rPr><w:endnoteRef/></w:r><w:r><w:t xml:space=\"preserve\"> "+ url + "</w:t></w:r></w:p>"; try { endnote.getEGBlockLevelElts().add( XmlUtils.unmarshalString(endnoteBody)); - } catch (JAXBException e) { + } catch (Exception e) { e.printStackTrace(); } @@ -95,7 +91,7 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ try { wordMLPackage.getMainDocumentPart().addParagraph(docBody); - } catch (JAXBException e) { + } catch (Exception e) { e.printStackTrace(); 
} @@ -172,20 +168,25 @@ public class WordDocBuilderEndNotes extends WordDocBuilderSingleImageSearchCall{ return bestPart; } + private boolean isAlphanumeric(final int codePoint) { + return (codePoint >= 65 && codePoint <= 90) || + (codePoint >= 97 && codePoint <= 122) || + (codePoint >= 48 && codePoint <= 57); + } - public static void main(String[] args){ - WordDocBuilderEndNotes b = new WordDocBuilderEndNotes(); - List<HitBase> content = new ArrayList<>(); - for(int i = 0; i<10; i++){ - HitBase h = new HitBase(); - h.setTitle("albert einstein "+i); - List<Fragment> frs = new ArrayList<>(); - frs.add(new Fragment(" content "+i, 0)); - h.setFragments(frs); - h.setUrl("http://www."+i+".com"); - content.add(h); - } - - b.buildWordDoc(content, "albert einstein"); - } + public static void main(String[] args){ + WordDocBuilderEndNotes b = new WordDocBuilderEndNotes(); + List<HitBase> content = new ArrayList<>(); + for(int i = 0; i<10; i++){ + HitBase h = new HitBase(); + h.setTitle("albert einstein "+i); + List<Fragment> frs = new ArrayList<>(); + frs.add(new Fragment(" content "+i, 0)); + h.setFragments(frs); + h.setUrl("http://www."+i+".com"); + content.add(h); + } + + b.buildWordDoc(content, "albert einstein"); + } } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java index 22dc78b..97eda63 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserChunker2MatcherProcessor.java @@ -18,11 +18,7 @@ package opennlp.tools.textsimilarity.chunker2matcher; -import java.io.BufferedInputStream; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; import 
java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.HashMap; @@ -39,18 +35,19 @@ import opennlp.tools.parser.ParserFactory; import opennlp.tools.parser.ParserModel; import opennlp.tools.postag.POSModel; import opennlp.tools.postag.POSTagger; -import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.postag.ThreadSafePOSTaggerME; import opennlp.tools.sentdetect.SentenceDetector; -import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.sentdetect.ThreadSafeSentenceDetectorME; import opennlp.tools.textsimilarity.LemmaPair; import opennlp.tools.textsimilarity.ParseTreeChunk; import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; import opennlp.tools.textsimilarity.SentencePairMatchResult; import opennlp.tools.textsimilarity.TextProcessor; +import opennlp.tools.tokenize.ThreadSafeTokenizerME; import opennlp.tools.tokenize.Tokenizer; -import opennlp.tools.tokenize.TokenizerME; import opennlp.tools.tokenize.TokenizerModel; +import opennlp.tools.util.DownloadUtil; import opennlp.tools.util.Span; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -60,11 +57,6 @@ public class ParserChunker2MatcherProcessor { private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); static final int MIN_SENTENCE_LENGTH = 10; - private static final String MODEL_DIR_KEY = "nlp.models.dir"; - // TODO config - // this is where resources should live - private static String MODEL_DIR=null; - private static final String MODEL_DIR_REL = "src/test/resources/models"; protected static ParserChunker2MatcherProcessor instance; private SentenceDetector sentenceDetector; @@ -75,30 +67,6 @@ public class ParserChunker2MatcherProcessor { private static final int NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS = 5; private Map<String, String[][]> sentence_parseObject; - public SentenceDetector getSentenceDetector() { - return sentenceDetector; - } - - public void 
setSentenceDetector(SentenceDetector sentenceDetector) { - this.sentenceDetector = sentenceDetector; - } - - public Tokenizer getTokenizer() { - return tokenizer; - } - - public void setTokenizer(Tokenizer tokenizer) { - this.tokenizer = tokenizer; - } - - public ChunkerME getChunker() { - return chunker; - } - - public void setChunker(ChunkerME chunker) { - this.chunker = chunker; - } - @SuppressWarnings("unchecked") protected ParserChunker2MatcherProcessor() { try { @@ -108,29 +76,65 @@ public class ParserChunker2MatcherProcessor { LOG.warn("parsing cache file does not exist (but should be created)"); sentence_parseObject = new HashMap<>(); } - if (sentence_parseObject == null) - sentence_parseObject = new HashMap<>(); try { - if (MODEL_DIR==null || MODEL_DIR.equals("/models")) { - String absPath = new File(".").getAbsolutePath(); - absPath = absPath.substring(0, absPath.length()-1); - MODEL_DIR = absPath + MODEL_DIR_REL; - } - //get full path from constructor - initializeSentenceDetector(); initializeTokenizer(); initializePosTagger(); initializeParser(); initializeChunker(); - } catch (Exception e) { // a typical error when 'model' is not installed - LOG.warn("The model can't be read and we rely on cache"); - LOG.warn("Please put OpenNLP model files in 'src/test/resources' (folder 'model')"); + } catch (IOException e) { + LOG.warn("A model can't be loaded: {}", e.getMessage()); } } - // closing the processor, clearing loaded ling models and serializing parsing cache + protected void initializeSentenceDetector() throws IOException { + SentenceModel model = DownloadUtil.downloadModel( + "en", DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); + sentenceDetector = new ThreadSafeSentenceDetectorME(model); + } + + protected void initializeTokenizer() throws IOException { + TokenizerModel model = DownloadUtil.downloadModel( + "en", DownloadUtil.ModelType.TOKENIZER, TokenizerModel.class); + tokenizer = new ThreadSafeTokenizerME(model); + } + + protected 
void initializePosTagger() throws IOException { + POSModel model = DownloadUtil.downloadModel( + "en", DownloadUtil.ModelType.POS, POSModel.class); + posTagger = new ThreadSafePOSTaggerME(model); + } + + protected void initializeParser() throws IOException { + ParserModel model = DownloadUtil.downloadModel( + "en", DownloadUtil.ModelType.PARSER, ParserModel.class); + parser = ParserFactory.create(model); + } + + private void initializeChunker() throws IOException { + ChunkerModel model = DownloadUtil.downloadModel( + "en", DownloadUtil.ModelType.CHUNKER, ChunkerModel.class); + chunker = new ChunkerME(model); + } + + public SentenceDetector getSentenceDetector() { + return sentenceDetector; + } + + public Tokenizer getTokenizer() { + return tokenizer; + } + + public POSTagger getPOSTagger() { + return posTagger; + } + + public ChunkerME getChunker() { + return chunker; + } + + // closing the processor and serializing parsing cache public void close() { instance = null; ParserCacheSerializer.writeObject(sentence_parseObject); @@ -147,14 +151,6 @@ public class ParserChunker2MatcherProcessor { return instance; } - - public synchronized static ParserChunker2MatcherProcessor getInstance(String fullPathToResources) { - MODEL_DIR = fullPathToResources+"/models"; - if (instance == null) - instance = new ParserChunker2MatcherProcessor(); - - return instance; - } /** * General parsing function, which returns lists of parses for a portion of @@ -165,7 +161,7 @@ public class ParserChunker2MatcherProcessor { * @return lists of parses */ public List<List<Parse>> parseTextNlp(String text) { - if (text == null || text.trim().length() == 0) + if (text == null || text.trim().isEmpty()) return null; List<List<Parse>> textParses = new ArrayList<>(1); @@ -173,7 +169,7 @@ public class ParserChunker2MatcherProcessor { // parse paragraph by paragraph String[] paragraphList = splitParagraph(text); for (String paragraph : paragraphList) { - if (paragraph.length() == 0) + if 
(paragraph.isEmpty()) continue; List<Parse> paragraphParses = parseParagraphNlp(paragraph); @@ -185,7 +181,7 @@ public class ParserChunker2MatcherProcessor { } public List<Parse> parseParagraphNlp(String paragraph) { - if (paragraph == null || paragraph.trim().length() == 0) + if (paragraph == null || paragraph.trim().isEmpty()) return null; // normalize the text before parsing, otherwise, the sentences may not @@ -197,7 +193,7 @@ public class ParserChunker2MatcherProcessor { List<Parse> parseList = new ArrayList<>(sentences.length); for (String sentence : sentences) { sentence = sentence.trim(); - if (sentence.length() == 0) + if (sentence.isEmpty()) continue; Parse sentenceParse = parseSentenceNlp(sentence, false); @@ -250,9 +246,8 @@ public class ParserChunker2MatcherProcessor { List<List<ParseTreeChunk>> singleSentChunks = formGroupedPhrasesFromChunksForSentence(sent); if (singleSentChunks == null) continue; - if (listOfChunksAccum.size() < 1) { - listOfChunksAccum = new ArrayList<>( - singleSentChunks); + if (listOfChunksAccum.isEmpty()) { + listOfChunksAccum = new ArrayList<>(singleSentChunks); } else for (int i = 0; i < NUMBER_OF_SECTIONS_IN_SENTENCE_CHUNKS; i++) { // make sure not null @@ -468,7 +463,7 @@ public class ParserChunker2MatcherProcessor { public static List<List<SentenceNode>> textToSentenceNodes( List<List<Parse>> textParses) { - if (textParses == null || textParses.size() == 0) + if (textParses == null || textParses.isEmpty()) return null; List<List<SentenceNode>> textNodes = new ArrayList<>( @@ -477,18 +472,18 @@ public class ParserChunker2MatcherProcessor { List<SentenceNode> paragraphNodes = paragraphToSentenceNodes(paragraphParses); // append paragraph node if any - if (paragraphNodes != null && paragraphNodes.size() > 0) + if (paragraphNodes != null && !paragraphNodes.isEmpty()) textNodes.add(paragraphNodes); } - if (textNodes.size() > 0) + if (!textNodes.isEmpty()) return textNodes; else return null; } public static List<SentenceNode> 
paragraphToSentenceNodes(List<Parse> paragraphParses) { - if (paragraphParses == null || paragraphParses.size() == 0) + if (paragraphParses == null || paragraphParses.isEmpty()) return null; List<SentenceNode> paragraphNodes = new ArrayList<>(paragraphParses.size()); @@ -506,7 +501,7 @@ public class ParserChunker2MatcherProcessor { paragraphNodes.add(sentenceNode); } - if (paragraphNodes.size() > 0) + if (!paragraphNodes.isEmpty()) return paragraphNodes; else return null; @@ -518,10 +513,10 @@ public class ParserChunker2MatcherProcessor { // convert the OpenNLP Parse to our own tree nodes SyntacticTreeNode node = toSyntacticTreeNode(sentenceParse); - if ((node == null)) + if (node == null) return null; - if (node instanceof SentenceNode) - return (SentenceNode) node; + if (node instanceof SentenceNode sn) + return sn; else if (node instanceof PhraseNode) { return new SentenceNode("sentence", node.getChildren()); } else @@ -575,56 +570,6 @@ public class ParserChunker2MatcherProcessor { return tokenizer.tokenize(sentence); } - protected void initializeSentenceDetector() { - try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-sent.bin"))) { - SentenceModel model = new SentenceModel(is); - sentenceDetector = new SentenceDetectorME(model); - } catch (IOException e) { - // we swallow exception to support the cached run - LOG.debug(e.getLocalizedMessage(), e); - } - } - - protected void initializeTokenizer() { - try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-token.bin"))) { - TokenizerModel model = new TokenizerModel(is); - tokenizer = new TokenizerME(model); - } catch (IOException e) { - // we swallow exception to support the cached run - LOG.debug(e.getLocalizedMessage(), e); - } - } - - protected void initializePosTagger() { - try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-pos-maxent.bin"))) { - POSModel model = new POSModel(is); - posTagger = new POSTaggerME(model); 
- } catch (IOException e) { - // we swallow exception to support the cached run - LOG.debug(e.getLocalizedMessage(), e); - } - } - - protected void initializeParser() { - try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-parser-chunking.bin"))) { - ParserModel model = new ParserModel(is); - parser = ParserFactory.create(model); - } catch (IOException e) { - // we swallow exception to support the cached run - LOG.debug(e.getLocalizedMessage(), e); - } - } - - private void initializeChunker() { - try (InputStream is = new BufferedInputStream(new FileInputStream(MODEL_DIR + "/en-chunker.bin"))) { - ChunkerModel model = new ChunkerModel(is); - chunker = new ChunkerME(model); - } catch (IOException e) { - // we swallow exception to support the cached run - LOG.debug(e.getLocalizedMessage(), e); - } - } - /** * convert an instance of Parse to SyntacticTreeNode, by filtering out the * unnecessary data and assigning the word for each node @@ -641,11 +586,11 @@ public class ParserChunker2MatcherProcessor { return null; String text = parse.getText(); - ArrayList<SyntacticTreeNode> childrenNodeList = convertChildrenNodes(parse); + List<SyntacticTreeNode> childrenNodeList = convertChildrenNodes(parse); // check sentence node, the node contained in the top node if (type.equals(AbstractBottomUpParser.TOP_NODE) - && childrenNodeList != null && childrenNodeList.size() > 0) { + && childrenNodeList != null && !childrenNodeList.isEmpty()) { PhraseNode rootNode; try { rootNode = (PhraseNode) childrenNodeList.get(0); @@ -656,7 +601,7 @@ public class ParserChunker2MatcherProcessor { } // if this node contains children nodes, then it is a phrase node - if (childrenNodeList != null && childrenNodeList.size() > 0) { + if (childrenNodeList != null && !childrenNodeList.isEmpty()) { // System.out.println("Found "+ type + " phrase = "+ childrenNodeList); return new PhraseNode(type, childrenNodeList); @@ -669,7 +614,7 @@ public class 
ParserChunker2MatcherProcessor { return new WordNode(type, word); } - private static ArrayList<SyntacticTreeNode> convertChildrenNodes(Parse parse) { + private static List<SyntacticTreeNode> convertChildrenNodes(Parse parse) { if (parse == null) return null; @@ -677,7 +622,7 @@ public class ParserChunker2MatcherProcessor { if (children == null || children.length == 0) return null; - ArrayList<SyntacticTreeNode> childrenNodeList = new ArrayList<>(); + List<SyntacticTreeNode> childrenNodeList = new ArrayList<>(); for (Parse child : children) { SyntacticTreeNode childNode = toSyntacticTreeNode(child); if (childNode != null) @@ -711,7 +656,7 @@ public class ParserChunker2MatcherProcessor { protected List<LemmaPair> listListParseTreeChunk2ListLemmaPairs( List<List<ParseTreeChunk>> sent1GrpLst) { List<LemmaPair> results = new ArrayList<>(); - if (sent1GrpLst == null || sent1GrpLst.size() < 1) + if (sent1GrpLst == null || sent1GrpLst.isEmpty()) return results; List<ParseTreeChunk> wholeSentence = sent1GrpLst .get(sent1GrpLst.size() - 1); // whole sentence is last list in the list diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java index 2e21705..c5e5dca 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/chunker2matcher/ParserPure2MatcherProcessor.java @@ -33,9 +33,13 @@ package opennlp.tools.textsimilarity.chunker2matcher; +import java.io.IOException; +import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.List; -import java.util.logging.Logger; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import opennlp.tools.textsimilarity.LemmaPair; import opennlp.tools.textsimilarity.ParseTreeChunk; @@ -44,9 +48,10 @@ 
import opennlp.tools.textsimilarity.SentencePairMatchResult; import opennlp.tools.textsimilarity.TextProcessor; public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor { + + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + protected static ParserPure2MatcherProcessor pinstance; - private static final Logger LOG = Logger - .getLogger("opennlp.tools.textsimilarity.chunker2matcher.ParserPure2MatcherProcessor"); public synchronized static ParserPure2MatcherProcessor getInstance() { if (pinstance == null) @@ -56,10 +61,14 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor } private ParserPure2MatcherProcessor() { - initializeSentenceDetector(); - initializeTokenizer(); - initializePosTagger(); - initializeParser(); + try { + initializeSentenceDetector(); + initializeTokenizer(); + initializePosTagger(); + initializeParser(); + } catch (IOException e) { + LOG.warn("A model can't be loaded: {}", e.getMessage()); + } } public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForSentence( @@ -70,7 +79,7 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor sentence = TextProcessor.removePunctuation(sentence); SentenceNode node = parseSentenceNode(sentence); if (node == null) { - LOG.info("Problem parsing sentence '" + sentence); + LOG.info("Problem parsing sentence '{}'", sentence); return null; } List<ParseTreeChunk> ptcList = node.getParseTreeChunkList(); @@ -78,7 +87,8 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor List<String> TokList = node.getOrderedLemmaList(); List<List<ParseTreeChunk>> listOfChunks = new ArrayList<>(); - List<ParseTreeChunk> nounPhr = new ArrayList<>(), prepPhr = new ArrayList<>(), verbPhr = new ArrayList<>(), adjPhr = new ArrayList<>(), + List<ParseTreeChunk> nounPhr = new ArrayList<>(), prepPhr = new ArrayList<>(), + verbPhr = new ArrayList<>(), adjPhr = new 
ArrayList<>(), // to store the whole sentence wholeSentence = new ArrayList<>(); @@ -112,11 +122,7 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor List<List<ParseTreeChunk>> sent1GrpLst = formGroupedPhrasesFromChunksForPara(para1), sent2GrpLst = formGroupedPhrasesFromChunksForPara(para2); - List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); // TODO - // need - // to - // populate - // it! + List<LemmaPair> origChunks1 = listListParseTreeChunk2ListLemmaPairs(sent1GrpLst); ParseTreeMatcherDeterministic md = new ParseTreeMatcherDeterministic(); List<List<ParseTreeChunk>> res = md @@ -126,16 +132,13 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor } public static void main(String[] args) throws Exception { - ParserPure2MatcherProcessor parser = ParserPure2MatcherProcessor - .getInstance(); + ParserPure2MatcherProcessor parser = ParserPure2MatcherProcessor.getInstance(); String text = "Its classy design and the Mercedes name make it a very cool vehicle to drive. "; List<List<ParseTreeChunk>> res = parser .formGroupedPhrasesFromChunksForPara(text); System.out.println(res); - // System.exit(0); - String phrase1 = "Its classy design and the Mercedes name make it a very cool vehicle to drive. " + "The engine makes it a powerful car. " + "The strong engine gives it enough power. " @@ -145,18 +148,15 @@ public class ParserPure2MatcherProcessor extends ParserChunker2MatcherProcessor + "This car provides you a very good mileage."; String sentence = "Not to worry with the 2cv."; - System.out.println(parser.assessRelevance(phrase1, phrase2) - .getMatchResult()); - - System.out - .println(parser - .formGroupedPhrasesFromChunksForSentence("Its classy design and the Mercedes name make it a very cool vehicle to drive. 
")); - System.out - .println(parser - .formGroupedPhrasesFromChunksForSentence("Sounds too good to be true but it actually is, the world's first flying car is finally here. ")); - System.out - .println(parser - .formGroupedPhrasesFromChunksForSentence("UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement")); + System.out.println(parser.assessRelevance(phrase1, phrase2).getMatchResult()); + + System.out.println(parser.formGroupedPhrasesFromChunksForSentence( + "Its classy design and the Mercedes name make it a very cool vehicle to drive. ")); + System.out.println(parser.formGroupedPhrasesFromChunksForSentence( + "Sounds too good to be true but it actually is, the world's first flying car is finally here. ")); + System.out.println(parser.formGroupedPhrasesFromChunksForSentence( + "UN Ambassador Ron Prosor repeated the Israeli position that the only way the Palestinians will get " + + "UN membership and statehood is through direct negotiations with the Israelis on a comprehensive peace agreement")); } } diff --git a/opennlp-similarity/src/test/resources/models/en-sent.bin b/opennlp-similarity/src/test/resources/models/en-sent.bin deleted file mode 100644 index e89076b..0000000 Binary files a/opennlp-similarity/src/test/resources/models/en-sent.bin and /dev/null differ diff --git a/pom.xml b/pom.xml index c2f4a52..e98b18d 100644 --- a/pom.xml +++ b/pom.xml @@ -158,22 +158,38 @@ <artifactId>slf4j-api</artifactId> <version>${slf4j.version}</version> </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> + <version>${slf4j.version}</version> + <scope>runtime</scope> + </dependency> <dependency> <groupId>commons-lang</groupId> <artifactId>commons-lang</artifactId> <version>2.6</version> </dependency> + <dependency> + <groupId>commons-io</groupId> + 
<artifactId>commons-io</artifactId> + <version>2.18.0</version> + </dependency> <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-lang3</artifactId> - <version>3.12.0</version> + <version>3.17.0</version> </dependency> <dependency> <groupId>commons-codec</groupId> <artifactId>commons-codec</artifactId> <version>1.15</version> </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.6.1</version> + </dependency> <dependency> <groupId>commons-logging</groupId> <artifactId>commons-logging</artifactId>
