This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch OPENNLP-1594-Add-stricter-tests-for-Summarizer-component in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 976f251606221335a3c0f22971f885bcc80ea785 Author: Martin Wiesner <[email protected]> AuthorDate: Thu Jul 11 11:09:09 2024 +0200 OPENNLP-1594 Add stricter tests for Summarizer component - adds further, stricter tests - clarifies, at API level, the semantics and constraints of parameters - separates tests so that each test class has a clear responsibility for its class under test - removes binary model files from test/resources folder - improves / enhances the JavaDoc further --- summarizer/pom.xml | 10 +- .../java/opennlp/summarization/DocProcessor.java | 14 +- .../src/main/java/opennlp/summarization/Score.java | 8 +- .../main/java/opennlp/summarization/Sentence.java | 80 +++++---- .../java/opennlp/summarization/Summarizer.java | 9 +- .../LexChainingKeywordExtractor.java | 45 +++-- .../lexicalchaining/LexicalChain.java | 18 +- .../lexicalchaining/LexicalChainingSummarizer.java | 200 ++++++++++++--------- .../lexicalchaining/NounPOSTagger.java | 124 +++++++++++++ .../lexicalchaining/OpenNLPPOSTagger.java | 92 ---------- .../summarization/lexicalchaining/POSTagger.java | 25 ++- .../WordRelationshipDetermination.java | 21 +-- .../summarization/lexicalchaining/WordnetWord.java | 69 ++++++- .../opennlp/summarization/meta/MetaSummarizer.java | 6 +- .../preprocess/DefaultDocProcessor.java | 156 +++++++++------- .../summarization/preprocess/IDFWordWeight.java | 13 +- .../summarization/preprocess/StopWords.java | 2 +- .../summarization/preprocess/WordWeight.java | 9 +- .../opennlp/summarization/textrank/TextRank.java | 100 ++++++----- .../summarization/textrank/TextRankSummarizer.java | 2 +- .../summarization/AbstractSummarizerTest.java | 28 +-- .../java/opennlp/summarization/SentenceTest.java | 104 +++++++++++ .../lexicalchaining/AbstractLexicalChainTest.java | 40 +++++ .../lexicalchaining/LexChainTest.java | 109 ----------- .../LexChainingKeywordExtractorTest.java | 68 ++++--- .../LexicalChainingSummarizerNewsTest.java} | 37 ++-- 
.../LexicalChainingSummarizerTest.java | 55 ++++-- .../lexicalchaining/NounPOSTaggerTest.java | 104 +++++++++++ .../WordRelationshipDeterminationTest.java | 63 +++++++ .../lexicalchaining/WordnetWordTest.java | 107 +++++++++++ .../preprocess/DefaultDocProcessorTest.java | 116 ++++++++++++ summarizer/src/test/resources/{meta => }/Notes.txt | 0 summarizer/src/test/resources/en-pos-maxent.bin | Bin 1175564 -> 0 bytes summarizer/src/test/resources/en-sent.bin | Bin 20317 -> 0 bytes summarizer/src/test/resources/{meta => }/idf.csv | 0 .../0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story | 0 .../0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story | 0 .../0a3040b6c1bba95efca727158f128a19c44ec8ba.story | 0 .../0a3479b53796863a664c32ca20d8672583335d2a.story | 0 .../0a3639cb86487e72e2ba084211f99799918aedf8.story | 0 .../0a4092bef1801863296777ebcfeceb1aec23c78f.story | 0 .../0a4324d4a5effa420aa95bb058314eab35c73852.story | 0 .../0a5458d3427b290524a8df11d8503a5b57b32747.story | 0 .../0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story | 0 .../0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story | 0 45 files changed, 1277 insertions(+), 557 deletions(-) diff --git a/summarizer/pom.xml b/summarizer/pom.xml index 19237f3..2c4da8f 100644 --- a/summarizer/pom.xml +++ b/summarizer/pom.xml @@ -31,10 +31,18 @@ <name>Apache OpenNLP Summarizer</name> <properties> + <wordnet.version>2.4.0</wordnet.version> <wordnet-dict.version>3.1</wordnet-dict.version> <maven.download.plugin>1.9.0</maven.download.plugin> </properties> + <repositories> + <repository> + <id>maven.aksw.org</id> + <url>https://maven.aksw.org/repository/internal/</url> + <releases/> + </repository> + </repositories> <dependencies> <dependency> @@ -45,7 +53,7 @@ <dependency> <groupId>edu.mit</groupId> <artifactId>jwi</artifactId> - <version>2.2.3</version> + <version>${wordnet.version}</version> </dependency> <dependency> diff --git a/summarizer/src/main/java/opennlp/summarization/DocProcessor.java 
b/summarizer/src/main/java/opennlp/summarization/DocProcessor.java index 65a992f..756744f 100644 --- a/summarizer/src/main/java/opennlp/summarization/DocProcessor.java +++ b/summarizer/src/main/java/opennlp/summarization/DocProcessor.java @@ -31,12 +31,20 @@ import opennlp.tools.stemmer.Stemmer; public interface DocProcessor { /** - * Extracts sentences from a string representing an article. + * Extracts {@link Sentence sentences} from a string representing an article. + * + * @param text The text to process; if {@code null} or empty, an empty list is returned. + * + * @return The resulting list of detected {@link Sentence sentences}. */ - List<Sentence> getSentencesFromStr(String text); + List<Sentence> getSentences(String text); /** - * Parses out words from a specified {@link String sent}. + * Extracts words from a specified {@link String sent}. + * + * @param sent The sentence to process; if {@code null} or empty, an zero length array is returned. + * + * @return An array of tokens (words) contained in the given {@code sent}. */ String[] getWords(String sent); diff --git a/summarizer/src/main/java/opennlp/summarization/Score.java b/summarizer/src/main/java/opennlp/summarization/Score.java index 76a2694..80751d6 100755 --- a/summarizer/src/main/java/opennlp/summarization/Score.java +++ b/summarizer/src/main/java/opennlp/summarization/Score.java @@ -18,14 +18,15 @@ package opennlp.summarization; /** - * Stores the score of a sentence for ranking sentences within a document. + * Encapsulates the score of a sentence for the purpose of ranking sentences within a document. 
*/ public class Score implements Comparable<Score> { private int sentId; private double score; - public Score() { - score = 0; + public Score(int sentId, double score) { + this.sentId = sentId; + this.score = score; } public int getSentId() { @@ -46,7 +47,6 @@ public class Score implements Comparable<Score> { @Override public int compareTo(Score o) { - if (o.score > score) return 1; else if (o.score < score) return -1; return 0; diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java index a158199..e59d809 100755 --- a/summarizer/src/main/java/opennlp/summarization/Sentence.java +++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java @@ -32,7 +32,8 @@ import opennlp.tools.stemmer.PorterStemmer; public class Sentence { private static final String SPACE = " "; - private final List<Sentence> links; + private final List<Sentence> links = new ArrayList<>(); + // sentId is always position of sentence in doc. private int sentId; private String stringVal; @@ -43,23 +44,32 @@ public class Sentence { private double wordWt = 0; private int wordCnt; - public Sentence() { - links = new ArrayList<>(); - } + /** + * Instantiates a plain {@link Sentence} via a set of parameters. + * + * @param id A numeric identifier with a postive value. + * @param stringVal The string representation of the sentence. + * @param paragraph TODO clarify exact meaning. + * @param paraPos TODO clarify exact meaning. + * @throws IllegalArgumentException Thrown if parameters are invalid. 
+ */ + public Sentence(int id, String stringVal, int paragraph, int paraPos) { + if (id < 0) throw new IllegalArgumentException("Parameter 'id' cannot be negative"); + if (stringVal == null || stringVal.isBlank()) + throw new IllegalArgumentException("Parameter 'stringVal' must not be null"); + if (paragraph < 0) throw new IllegalArgumentException("Parameter 'paragraph' cannot be negative"); + if (paraPos < 0) throw new IllegalArgumentException("Parameter 'paraPos' cannot be negative"); - public Sentence(int id) { - this(); this.sentId = id; - } + setParagraph(paragraph); + setStringVal(stringVal); + setParaPos(paraPos); + }; public int getSentId() { return sentId; } - public void setSentId(int sentId) { - this.sentId = sentId; - } - public Score getPageRankScore() { return pageRankScore; } @@ -113,38 +123,21 @@ public class Sentence { return this.links; } - public double getWordWt() { + public double getWordWeight() { return wordWt; } - public void setWordWt(double wordWt) { + public void setWordWeight(double wordWt) { this.wordWt = wordWt; } public int getWordCnt() { - return wordCnt == 0 ? this.getStringVal().split("\\s+").length : wordCnt; - } - - // Should add an article id to the sentence class. For now returns true if the ids are the same. - - @Override - public final boolean equals(Object o) { - if (this == o) return true; - if (!(o instanceof Sentence sentence)) return false; - - return sentId == sentence.sentId; - } - - @Override - public int hashCode() { - return Objects.hash(sentId); - } - - @Override - public String toString() { - return this.stringVal;//+ "("+ this.paragraph +", "+this.paraPos+")"; + return wordCnt; } + /** + * @return Applies stemming to each word and returns a fully-stemmed representation of a sentence. 
+ */ public String stem() { PorterStemmer stemmer = new PorterStemmer(); StopWords sw = StopWords.getInstance(); @@ -167,4 +160,23 @@ public class Sentence { } return b.toString(); } + + // Should add an article id to the sentence class. For now returns true if the ids are the same. + @Override + public final boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof Sentence sentence)) return false; + + return sentId == sentence.sentId; + } + + @Override + public int hashCode() { + return Objects.hash(sentId); + } + + @Override + public String toString() { + return this.stringVal;//+ "("+ this.paragraph +", "+this.paraPos+")"; + } } diff --git a/summarizer/src/main/java/opennlp/summarization/Summarizer.java b/summarizer/src/main/java/opennlp/summarization/Summarizer.java index e3ae124..8271868 100644 --- a/summarizer/src/main/java/opennlp/summarization/Summarizer.java +++ b/summarizer/src/main/java/opennlp/summarization/Summarizer.java @@ -17,15 +17,18 @@ package opennlp.summarization; +/** + * Describes the API of a component which summarizes the content of news, articles or books. + */ public interface Summarizer { /** - * Summarizes a given {@code article}. The length of the summary is + * Summarizes a given {@code text}. The length of the summary is * influenced by the specified {@code maxWords} parameter. * - * @param article The text to summarize. Must not be {@code null} and not be blank. + * @param text The content to summarize. Must not be {@code null} and not be blank. * @param maxWords The maximum number of words. Must be larger than {@code zero}. * @return The summary or an {@code empty} String if no summary could be derived. 
*/ - String summarize(String article, int maxWords); + String summarize(String text, int maxWords); } diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java index a313928..10820cd 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java @@ -22,20 +22,45 @@ import java.util.Collections; import java.util.List; /** - * Uses the lexical chaining algorithm to extract keywords. + * Uses the {@link LexicalChain lexical chaining} algorithm to extract keywords. + * + * @see LexicalChain */ public class LexChainingKeywordExtractor { - // Simple logic to pull out the keyword based on longest lexical chains.. - public List<String> getKeywords(List<LexicalChain> lexicalChains, int noOfKeywords) { - Collections.sort(lexicalChains); - List<String> ret = new ArrayList<>(); - for (int i = 0; i < Math.min(lexicalChains.size(), noOfKeywords); i++) { - List<Word> words = lexicalChains.get(i).getWord(); - if (!words.isEmpty() && !ret.contains(words.get(0).getLexicon())) { - ret.add(words.get(0).getLexicon()); + /** + * Extracts keywords from a list of {@link LexicalChain lexical chains}, limited by {@code noOfKeywords}. + * + * @param lexicalChains The {@link LexicalChain lexical chains} to process. Must not be {@code null}. + * @param noOfKeywords The upper limit of keywords. Must be greater than {@code zero}. + * + * @return The extracted keywords as a list. Guaranteed to be not {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + * @implNote This operation is based on longest lexical chains. 
+ */ + public List<String> extractKeywords(List<LexicalChain> lexicalChains, int noOfKeywords) { + if (lexicalChains == null) { + throw new IllegalArgumentException("Parameter 'lexicalChains' must not be null."); + } + if (noOfKeywords <= 0) { + throw new IllegalArgumentException("Parameter 'noOfKeywords' must be greater than 0."); + } + if (lexicalChains.isEmpty()) { + return Collections.emptyList(); + } else { + Collections.sort(lexicalChains); + List<String> ret = new ArrayList<>(); + for (int i = 0; i < Math.min(lexicalChains.size(), noOfKeywords); i++) { + List<Word> words = lexicalChains.get(i).getWords(); + if (!words.isEmpty()) { + Word w = words.get(0); + if (!ret.contains(w.getLexicon())) { + ret.add(w.getLexicon()); + } + } } + return ret; } - return ret; } } diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java index 3da83e3..612465c 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChain.java @@ -22,17 +22,19 @@ import java.util.List; import opennlp.summarization.Sentence; +/** + * Represents a lexical chain. 
+ */ public class LexicalChain implements Comparable<LexicalChain> { - final List<Word> word; - final List<Sentence> sentences; + + final List<Word> words = new ArrayList<>(); + final List<Sentence> sentences = new ArrayList<>(); int start, last; int score; int occurrences = 1; public LexicalChain() { - word = new ArrayList<>(); - sentences = new ArrayList<>(); } public double score() { @@ -40,7 +42,7 @@ public class LexicalChain implements Comparable<LexicalChain> { } public int length() { - return word.size(); + return words.size(); } public float homogeneity() { @@ -48,7 +50,7 @@ public class LexicalChain implements Comparable<LexicalChain> { } public void addWord(Word w) { - word.add(w); + words.add(w); } public void addSentence(Sentence sent) { @@ -56,8 +58,8 @@ public class LexicalChain implements Comparable<LexicalChain> { sentences.add(sent); } - public List<Word> getWord() { - return word; + public List<Word> getWords() { + return words; } public List<Sentence> getSentences() { diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java index f243d69..53e480b 100755 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java @@ -17,7 +17,7 @@ package opennlp.summarization.lexicalchaining; -import java.io.InputStream; +import java.io.IOException; import java.util.ArrayList; import java.util.Collections; import java.util.Hashtable; @@ -26,15 +26,19 @@ import java.util.List; import opennlp.summarization.DocProcessor; import opennlp.summarization.Sentence; import opennlp.summarization.Summarizer; +import opennlp.tools.postag.POSModel; /** - * Implements the algorithm outlined in - "Summarization Using Lexical Chains" by R. Berzilay et al. 
- * <p> + * Implements a {@link Summarizer summarization} algorithm outlined in: <br/> + * <a href="https://aclanthology.org/W97-0703.pdf"> + * "Summarization Using Lexical Chains"</a>, by Regina Berzilay and Michael Elhadad. + * <br/><br/> * The algorithm is based on extracting so-called lexical chains - a set of sentences in the article - * that share a word that are very closely related. Thus, the longest chain represents the most important + * that share a {@link Word} that are very closely related. Thus, the longest chain represents the most important * topic and so forth. A summary can then be formed by identifying the most important lexical chains * and "pulling" out sentences from them. * + * @see Word * @see LexicalChain * @see Summarizer */ @@ -44,87 +48,122 @@ public class LexicalChainingSummarizer implements Summarizer { private final DocProcessor docProcessor; private final WordRelationshipDetermination wordRel; - public LexicalChainingSummarizer(DocProcessor dp, OpenNLPPOSTagger posTagger) { - docProcessor = dp; - tagger = posTagger; - wordRel = new WordRelationshipDetermination(); + /** + * Instantiates a {@link LexicalChainingSummarizer}. + * + * @param docProcessor The {@link DocProcessor} to use at runtime. Must not be {@code null}. + * @param languageCode An ISO-language code for obtaining a {@link POSModel}. + * Must not be {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public LexicalChainingSummarizer(DocProcessor docProcessor, String languageCode) throws IOException { + this(docProcessor, new NounPOSTagger(languageCode)); } - public LexicalChainingSummarizer(DocProcessor dp, InputStream posModelFile) throws Exception { - this(dp, new OpenNLPPOSTagger(dp, posModelFile)); + /** + * Instantiates a {@link LexicalChainingSummarizer}. + * + * @param docProcessor The {@link DocProcessor} to use at runtime. Must not be {@code null}. + * @param posTagger The {@link NounPOSTagger} to use at runtime. 
Must not be {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public LexicalChainingSummarizer(DocProcessor docProcessor, NounPOSTagger posTagger) { + if (docProcessor == null) throw new IllegalArgumentException("Parameter 'docProcessor' must not be null!"); + if (posTagger == null) throw new IllegalArgumentException("Parameter 'posTagger' must not be null!"); + + this.docProcessor = docProcessor; + tagger = posTagger; + wordRel = new WordRelationshipDetermination(); } - //Build Lexical chains.. - public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sent) { - // POS tag article - Hashtable<String, List<LexicalChain>> chains = new Hashtable<>(); - List<LexicalChain> lc = new ArrayList<>(); - // Build lexical chains - // For each sentence - for (Sentence currSent : sent) { - String taggedSent = tagger.getTaggedString(currSent.getStringVal()); - List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN); - // For each noun - for (String noun : nouns) { - int chainsAddCnt = 0; - // Loop through each LC - for (LexicalChain l : lc) { - try { - WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7); - // Is the noun an exact match to one of the current LCs (Strong relation) - // Add sentence to chain - if (rel.relation() == WordRelation.STRONG_RELATION) { - addToChain(rel.dest(), l, chains, currSent); - if (currSent.getSentId() - l.last > 10) { - l.occurrences++; - l.start = currSent.getSentId(); - } - chainsAddCnt++; - } else if (rel.relation() == WordRelation.MED_RELATION) { - // Add sentence to chain if it is 7 sentences away from start of chain - addToChain(rel.dest(), l, chains, currSent); - chainsAddCnt++; - //If greater than 7 we will add it but call it a new occurrence of the lexical chain... 
- if (currSent.getSentId() - l.start > 7) { - l.occurrences++; - l.start = currSent.getSentId(); - } - } else if (rel.relation() == WordRelation.WEAK_RELATION) { - if (currSent.getSentId() - l.start <= 3) { + /** + * Constructs a list of {@link LexicalChain lexical chains} from specified sentences. + * + * @param article TODO unused parameter -> remove it?! + * @param sentences The list of {@link Sentence sentences} to build lexical chains from. + * Must not be {@code null}. + * @return The result list of {@link LexicalChain lexical chains}. Guaranteed to be not {@code null}. + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public List<LexicalChain> buildLexicalChains(String article, List<Sentence> sentences) { + if (sentences == null) throw new IllegalArgumentException("Parameter 'sentences' must not be null!"); + else { + if (sentences.isEmpty()) { + return Collections.emptyList(); + } + Hashtable<String, List<LexicalChain>> chains = new Hashtable<>(); + List<LexicalChain> lc = new ArrayList<>(); + // Build lexical chains + // For each sentence + for (Sentence currSent : sentences) { + // POS tag article + String taggedSent = tagger.getTaggedString(currSent.getStringVal().replace(".", " .")); + List<String> nouns = tagger.getWordsOfType(docProcessor.getWords(taggedSent), POSTagger.NOUN); + // For each noun + for (String noun : nouns) { + int chainsAddCnt = 0; + // Loop through each LC + for (LexicalChain l : lc) { + try { + WordRelation rel = wordRel.getRelation(l, noun, (currSent.getSentId() - l.start) > 7); + // Is the noun an exact match to one of the current LCs (Strong relation) + // Add sentence to chain + if (rel.relation() == WordRelation.STRONG_RELATION) { addToChain(rel.dest(), l, chains, currSent); + if (currSent.getSentId() - l.last > 10) { + l.occurrences++; + l.start = currSent.getSentId(); + } chainsAddCnt++; + } else if (rel.relation() == WordRelation.MED_RELATION) { + // Add sentence to chain if it is 7 sentences away 
from start of chain + addToChain(rel.dest(), l, chains, currSent); + chainsAddCnt++; + // If greater than 7 we will add it but call it a new occurrence of the lexical chain... + if (currSent.getSentId() - l.start > 7) { + l.occurrences++; + l.start = currSent.getSentId(); + } + } else if (rel.relation() == WordRelation.WEAK_RELATION) { + if (currSent.getSentId() - l.start <= 3) { + addToChain(rel.dest(), l, chains, currSent); + chainsAddCnt++; + } } + } catch (Exception ex) { + throw new RuntimeException(ex); } - } catch (Exception ex) { + // add sentence and update last occurrence.. + //chaincnt++ + // else 1 hop-relation in Wordnet (weak relation) + // Add sentence to chain if it is 3 sentences away from start of chain + //chaincnt++ + // End loop LC } - // add sentence and update last occurrence.. - //chaincnt++ - // else 1 hop-relation in Wordnet (weak relation) - // Add sentence to chain if it is 3 sentences away from start of chain - //chaincnt++ - // End loop LC - } - //Could not add the word to any existing list. Start a new lexical chain with the word. - if (chainsAddCnt == 0) { - List<Word> senses = wordRel.getWordSenses(noun); - for (Word w : senses) { - LexicalChain newLc = new LexicalChain(); - newLc.start = currSent.getSentId(); - addToChain(w, newLc, chains, currSent); - lc.add(newLc); + // Could not add the word to any existing list. Start a new lexical chain with the word. + if (chainsAddCnt == 0) { + List<Word> senses = wordRel.getWordSenses(noun); + for (Word w : senses) { + LexicalChain newLc = new LexicalChain(); + newLc.start = currSent.getSentId(); + addToChain(w, newLc, chains, currSent); + lc.add(newLc); + } } + if (lc.size() > 20) + purge(lc, currSent.getSentId(), sentences.size()); } - if (lc.size() > 20) - purge(lc, currSent.getSentId(), sent.size()); + //End sentence } - //End sentence - } // disambiguateAndCleanChains(lc, chains); - // Calculate score - // Length of chain * homogeneity - //sort LC by strength. 
- return lc; + // Calculate score + // Length of chain * homogeneity + //sort LC by strength. + return lc; + } } /* @@ -132,7 +171,7 @@ public class LexicalChainingSummarizer implements Summarizer { * Takes care to only remove small chains that were added "long back" */ private void purge(List<LexicalChain> lc, int sentId, int totSents) { - //Do nothing for the first 50 sentences. + //Do nothing for the first 20 sentences. if (lc.size() < 20) return; Collections.sort(lc); @@ -146,12 +185,12 @@ public class LexicalChainingSummarizer implements Summarizer { LexicalChain l = lc.get(i); if (l.score() < cutOff && (sentId - l.last) > totSents / 3)// && containsAllWords(words, l.word)) toRem.add(l); - //A different sense and added long back. - else if (words.containsKey(l.getWord().get(0).getLexicon()) && (sentId - l.start) > totSents / 10) + // A different sense and added long back. + else if (words.containsKey(l.getWords().get(0).getLexicon()) && (sentId - l.start) > totSents / 10) toRem.add(l); else { - //Check if this is from a word with different sense.. - for (Word w : l.word) + // Check if this is from a word with different sense.. 
+ for (Word w : l.words) words.put(w.getLexicon(), Boolean.TRUE); } } @@ -169,9 +208,7 @@ public class LexicalChainingSummarizer implements Summarizer { return ret; } - private void addToChain(Word noun, LexicalChain l, - Hashtable<String, List<LexicalChain>> chains, Sentence sent) { - + private void addToChain(Word noun, LexicalChain l, Hashtable<String, List<LexicalChain>> chains, Sentence sent) { l.addWord(noun); l.addSentence(sent); l.last = sent.getSentId(); @@ -182,14 +219,13 @@ public class LexicalChainingSummarizer implements Summarizer { @Override public String summarize(String article, int maxWords) { - List<Sentence> sent = docProcessor.getSentencesFromStr(article); + List<Sentence> sent = docProcessor.getSentences(article); List<LexicalChain> lc = buildLexicalChains(article, sent); Collections.sort(lc); int summSize = 0; List<Sentence> summ = new ArrayList<>(); StringBuilder sb = new StringBuilder(); - for (int i = 0; i < lc.size(); i++) { - LexicalChain chain = lc.get(i); + for (LexicalChain chain : lc) { for (int j = 0; j < chain.sentences.size(); j++) { Sentence candidate = chain.sentences.get(j); if (!summ.contains(candidate)) { diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java new file mode 100644 index 0000000..2acc60b --- /dev/null +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/NounPOSTagger.java @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package opennlp.summarization.lexicalchaining; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; +import java.util.Set; + +import opennlp.tools.postag.POSModel; +import opennlp.tools.postag.POSTaggerME; +import opennlp.tools.tokenize.WhitespaceTokenizer; +import opennlp.tools.util.DownloadUtil; + +/** + * A {@link POSTagger} wrapper implementation that relies on an OpenNLP {@link POSTaggerME}. + * + * @see POSTagger + * @see POSTaggerME + */ +public class NounPOSTagger implements POSTagger { + + public static final String[] TAGS_NOUNS = {"NOUN", "NN", "NNS", "NNP", "NNPS"}; + private static final Set<String> EOS_CHARS = Set.of(".", "?", "!"); + + private final POSTaggerME tagger; + private final Map<Integer, String[]> tagMap = new Hashtable<>(); + + /** + * Instantiates a {@link NounPOSTagger} for a POS model for the specified {@code languageCode}. + * + * @param languageCode An ISO-language code for obtaining a {@link POSModel}. + * Must not be {@code null}. + * @throws IOException Thrown if IO errors occurred. + * @throws IllegalArgumentException Thrown if parameters are invalid. 
+ */ + public NounPOSTagger(String languageCode) throws IOException { + if (languageCode == null || languageCode.isBlank()) + throw new IllegalArgumentException("Parameter 'languageCode' must not be null"); + // init Tag map + tagMap.put(POSTagger.NOUN, TAGS_NOUNS); + POSModel posModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.POS, POSModel.class); + tagger = new POSTaggerME(posModel); + } + + /** + * @return {@code true} if the type string belongs to one of the (noun) tags for the type, + * {@code false} otherwise. + */ + public boolean isType(String typeStr, int type) { + boolean ret = false; + String[] tags = tagMap.get(type); + if (tags != null) { + for (String tag : tags) { + if (typeStr.equalsIgnoreCase(tag)) { + ret = true; + break; + } + } + return ret; + } else { + return false; + } + } + + /** + * {@inheritDoc} + */ + @Override + public String getTaggedString(String input) { + if (input == null) throw new IllegalArgumentException("Parameter 'input' must not be null"); + + String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(input); + String[] tags = tagger.tag(tokens); + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < tokens.length; i++) { + sb.append(tokens[i]).append("/").append(tags[i]); + // whitespace appending only for non-EOS / PUNCT tokens, skipping for actual EOS tokens + if (! 
(EOS_CHARS.contains(tokens[i]) && tokens.length == i + 1)) { + sb.append(" "); + } + } + return sb.toString(); + } + + /** + * {@inheritDoc} + */ + @Override + public List<String> getWordsOfType(String[] tokens, int type) { + if (tokens == null) throw new IllegalArgumentException("Parameter 'tokens' must not be null"); + if (type < 0 || type > PRONOUN) throw new IllegalArgumentException("Parameter 'type' must be in range [0, 4]"); + + List<String> ret = new ArrayList<>(); + for (String t : tokens) { + String[] wordPlusType = t.split("/"); + if (wordPlusType.length == 2) { + if (isType(wordPlusType[1], type)) + ret.add(wordPlusType[0]); + } else { + throw new IllegalArgumentException("Token '" + t + "' is not tagged correctly!"); + } + } + // log.info(ret.toString()); + return ret; + } +} diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java deleted file mode 100644 index 39edde3..0000000 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/OpenNLPPOSTagger.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.summarization.lexicalchaining; - -import java.io.BufferedInputStream; -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.Hashtable; -import java.util.List; - -import opennlp.summarization.DocProcessor; -import opennlp.tools.postag.POSModel; -import opennlp.tools.postag.POSTaggerME; -import opennlp.tools.tokenize.WhitespaceTokenizer; - -public class OpenNLPPOSTagger implements POSTagger { - - private final POSTaggerME tagger; - private final DocProcessor dp; - private final String[] nounTags = {"NOUN", "NN", "NNS", "NNP", "NNPS"}; - private Hashtable<Integer, String[]> tagMap; - - public OpenNLPPOSTagger(DocProcessor dp, InputStream posModelFile) throws IOException { - this.dp = dp; - initTagMap(); - - try (InputStream modelIn = new BufferedInputStream(posModelFile)) { - POSModel model = new POSModel(modelIn); - tagger = new POSTaggerME(model); - } - } - - private void initTagMap() { - tagMap = new Hashtable<>(); - tagMap.put(POSTagger.NOUN, nounTags); - } - - // Returns true if the type string belongs to one of the tags for the type - public boolean isType(String typeStr, int type) { - boolean ret = false; - String[] tags = tagMap.get(type); - for (String tag : tags) { - if (typeStr.equalsIgnoreCase(tag)) { - ret = true; - break; - } - } - return ret; - } - - @Override - public String getTaggedString(String input) { - String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(input); - String[] tags = tagger.tag(tokens); - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < tokens.length; i++) { - sb.append(tokens[i]).append("/").append(tags[i]).append(" "); - } - return sb.toString(); - } - - @Override - public List<String> getWordsOfType(String sent, int type) { - List<String> ret = new ArrayList<>(); - String[] tokens = dp.getWords(sent); - for (String t : tokens) { - String[] wordPlusType = t.split("/"); - if (wordPlusType.length == 2) { - if (isType(wordPlusType[1], type)) - 
ret.add(wordPlusType[0]); - } - } - // log.info(ret.toString()); - return ret; - } -} diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java index d6b5d2d..af468ed 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java @@ -19,6 +19,10 @@ package opennlp.summarization.lexicalchaining; import java.util.List; +/** + * A basic POS tagger which describes functionality to tag text and + * filter tokens for certain word classes. + */ public interface POSTagger { //Tagger types.. @@ -28,7 +32,26 @@ public interface POSTagger { int ADVERB = 3; int PRONOUN = 4; + /** + * Tags a given {@code input} text so that word classes are appended to each token. + * + * @param input The text to process. Must not be {@code null}. If empty, an empty String is returned. + * @return The POS tagged text. May be empty. + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ String getTaggedString(String input); - List<String> getWordsOfType(String sent, int type); + /** + * Extracts words from POS-tagged {@code tokens} which equal a certain word class ({@code type}). + * + * @param tokens An array of words to filter for their word class ({@code type}). Must not be {@code null}. + * Must be in a tagged form, that is, separated into {@code token/word-class} pairs. + * @param type One of the supported types: {@link #NOUN}, {@link #VERB}, {@link #ADJECTIVE}, + * {@link #ADVERB}, or {@link #PRONOUN}. Must not be less than {@code zero} + * and not be more than {@link #PRONOUN}. + * @return A list of words that match the given {@code type}. May be empty, yet guaranteed to be non-{@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. 
+ */ + List<String> getWordsOfType(String[] tokens, int type); } diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java index ebe352f..59b8a76 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordRelationshipDetermination.java @@ -58,7 +58,7 @@ public class WordRelationshipDetermination { try { DICTIONARY.open(); } catch (IOException e) { - e.printStackTrace(); + throw new RuntimeException(e); } } @@ -130,10 +130,7 @@ public class WordRelationshipDetermination { WordnetWord ww = (WordnetWord) w; IWord syn; if ((syn = this.isSynonym(noun, w)) != null) { - ret = new WordnetWord(); - ret.lexicon = noun; - ret.id = syn.getID(); - ret.wordSense = syn.getSenseKey(); + ret = new WordnetWord(noun, syn.getSenseKey(), syn.getID()); } //Construct an IWord object representing word associated with wordID @@ -156,10 +153,7 @@ public class WordRelationshipDetermination { ISynset s = this.DICTIONARY.getSynset(id); IWord mat = inSynset(s, idxNoun); if (mat != null) { - ret = new WordnetWord(); - ret.lexicon = noun; - ret.id = mat.getID(); - ret.wordSense = mat.getSenseKey(); + ret = new WordnetWord(noun, mat.getSenseKey(), mat.getID()); break; } } @@ -175,7 +169,7 @@ public class WordRelationshipDetermination { */ public WordRelation getRelation(LexicalChain l, String noun, boolean checkMed) { WordRelation ret = new WordRelation(null, null, WordRelation.NO_RELATION); - for (Word w : l.word) { + for (Word w : l.words) { //Exact match is a string relation. 
if (w.getLexicon().equalsIgnoreCase(noun)) { ret = new WordRelation(w, w, WordRelation.STRONG_RELATION); @@ -199,15 +193,12 @@ public class WordRelationshipDetermination { // openDict(); List<IWordID> wordIDs = this.DICTIONARY.getIndexWord(noun, POS.NOUN).getWordIDs(); for (IWordID wid : wordIDs) { - Word w = new WordnetWord(); - w.setLexicon(noun); - w.setID(wid); + Word w = new WordnetWord(noun, wid); ret.add(w); } } catch (Exception ex) { //Not in dictionary - Word w = new WordnetWord(); - w.setLexicon(noun); + Word w = new WordnetWord(noun); ret.add(w); } return ret; diff --git a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java index a110719..0cf026d 100644 --- a/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java +++ b/summarizer/src/main/java/opennlp/summarization/lexicalchaining/WordnetWord.java @@ -18,6 +18,7 @@ package opennlp.summarization.lexicalchaining; import java.util.Hashtable; import java.util.List; +import java.util.Objects; import edu.mit.jwi.item.IPointer; import edu.mit.jwi.item.ISenseKey; @@ -25,16 +26,58 @@ import edu.mit.jwi.item.ISynset; import edu.mit.jwi.item.ISynsetID; import edu.mit.jwi.item.IWordID; +/** + * A {@link Word} implementation based on Wordnet concepts. + */ public class WordnetWord implements Word { - final Hashtable<IPointer, List<ISynsetID>> rels; - String lexicon; - ISenseKey wordSense; - IWordID id; - //Cache.. + + private String lexicon; + private IWordID id; + private ISenseKey wordSense; + + final Hashtable<IPointer, List<ISynsetID>> rels = new Hashtable<>(); + // Cache.. ISynset synonyms; - public WordnetWord() { - rels = new Hashtable<>(); + /** + * Instantiates a {@link WordnetWord} via its lexicon term. + * + * @param lexicon Must not be {@code null} and not be an empty string. + * @throws IllegalArgumentException Thrown if parameters are invalid. 
+ */ + public WordnetWord(String lexicon) { + if (lexicon == null || lexicon.isBlank()) throw new IllegalArgumentException("parameter 'lexicon' must not be null or empty"); + setLexicon(lexicon); + } + + /** + * Instantiates a {@link WordnetWord} via its lexicon term and a {@link IWordID}. + * + * @param lexicon Must not be {@code null} and not be an empty string. + * @param id A unique identifier sufficient to retrieve a particular word from the Wordnet database. + * Must not be {@code null}. + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public WordnetWord(String lexicon, IWordID id) { + this(lexicon); + if (id == null) throw new IllegalArgumentException("parameter 'id' must not be null"); + setID(id); + } + + /** + * Instantiates a {@link WordnetWord} via its lexicon term and a {@link IWordID}. + * + * @param lexicon Must not be {@code null} and not be an empty string. + * @param wordSense A sense key is a unique string that identifies a Wordnet word. + * Must not be {@code null}. + * @param id A unique identifier sufficient to retrieve a particular word from the Wordnet database. + * Must not be {@code null}. + * @throws IllegalArgumentException Thrown if parameters are invalid. 
+ */ + public WordnetWord(String lexicon, ISenseKey wordSense, IWordID id) { + this(lexicon, id); + if (wordSense == null) throw new IllegalArgumentException("parameter 'wordSense' must not be null"); + setSense(wordSense); } @Override @@ -72,8 +115,18 @@ public class WordnetWord implements Word { return this.lexicon; } + @Override + public final boolean equals(Object o) { + if (this == o) return true; + if (!(o instanceof WordnetWord that)) return false; + + return Objects.equals(lexicon, that.lexicon) && Objects.equals(id, that.id); + } + @Override public int hashCode() { - return toString().hashCode(); + int result = Objects.hashCode(lexicon); + result = 31 * result + Objects.hashCode(id); + return result; } } diff --git a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java index 7fa1155..c52d4be 100644 --- a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java +++ b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java @@ -28,7 +28,7 @@ import opennlp.summarization.Sentence; import opennlp.summarization.Summarizer; import opennlp.summarization.lexicalchaining.LexicalChain; import opennlp.summarization.lexicalchaining.LexicalChainingSummarizer; -import opennlp.summarization.lexicalchaining.OpenNLPPOSTagger; +import opennlp.summarization.lexicalchaining.NounPOSTagger; import opennlp.summarization.textrank.TextRankSummarizer; import opennlp.summarization.DocProcessor; @@ -47,7 +47,7 @@ public class MetaSummarizer implements Summarizer { private final TextRankSummarizer textRank; private final LexicalChainingSummarizer lcs; - public MetaSummarizer(DocProcessor docProcessor, OpenNLPPOSTagger posTagger) { + public MetaSummarizer(DocProcessor docProcessor, NounPOSTagger posTagger) { dp = docProcessor; textRank = new TextRankSummarizer(dp); lcs = new LexicalChainingSummarizer(dp, posTagger); @@ -117,7 +117,7 @@ public class MetaSummarizer 
implements Summarizer { @Override public String summarize(String article, int maxWords) { // Build lexical Chains.. - List<Sentence> sent = dp.getSentencesFromStr(article); + List<Sentence> sent = dp.getSentences(article); List<Score> finalSc = rankSentences(article, sent, maxWords); StringBuilder sb = new StringBuilder(); diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java index c185361..a638d68 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java @@ -17,8 +17,7 @@ package opennlp.summarization.preprocess; -import java.io.BufferedInputStream; -import java.io.FileReader; +import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.io.LineNumberReader; @@ -28,8 +27,6 @@ import java.util.List; import java.util.ArrayList; import java.util.Locale; import java.util.Hashtable; -import java.util.logging.Level; -import java.util.logging.Logger; import java.util.regex.Pattern; import opennlp.summarization.Sentence; @@ -38,6 +35,7 @@ import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; import opennlp.tools.stemmer.PorterStemmer; import opennlp.tools.stemmer.Stemmer; +import opennlp.tools.util.DownloadUtil; /** * Parses a document to sentences. @@ -53,16 +51,21 @@ public class DefaultDocProcessor implements DocProcessor { private static final int SENTENCE_FRAG = OPEN_NLP; private final Stemmer stemmer; - private SentenceModel sentModel; - - public DefaultDocProcessor(InputStream fragModelFile) { + private final SentenceModel sentModel; + + /** + * Instantiates a {@link DocProcessor} for a Sentence detection model for the specified {@code languageCode}. 
+ * + * @param languageCode An ISO-language code for obtaining a {@link SentenceModel}. + * Must not be {@code null} and not be blank. + * @throws IOException Thrown if IO errors occurred. + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public DefaultDocProcessor(String languageCode) throws IOException { + if (languageCode == null || languageCode.isBlank()) + throw new IllegalArgumentException("Parameter 'languageCode' must not be null or blank"); stemmer = new PorterStemmer(); - - try (InputStream modelIn = new BufferedInputStream(fragModelFile)) { - sentModel = new SentenceModel(modelIn); - } catch (Exception ex) { - Logger.getAnonymousLogger().info("Error while parsing.. Ignoring the line and marching on.. " + ex.getMessage()); - } + sentModel = DownloadUtil.downloadModel(languageCode, DownloadUtil.ModelType.SENTENCE_DETECTOR, SentenceModel.class); } // Str - Document or para @@ -81,8 +84,8 @@ public class DefaultDocProcessor implements DocProcessor { for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) { String sentence = str.substring(start, end);//str.substring(oldSentEndIdx, sentEndIdx).trim(); - //Add the sentence as-is; do any processing at the word level - //To lower case and trim all punctuations + // Add the sentence as-is; do any processing at the word level + // To lower case and trim all punctuations sentences.add(sentence); wrdItr.setText(sentence); StringBuilder procSent = new StringBuilder(); @@ -93,12 +96,12 @@ public class DefaultDocProcessor implements DocProcessor { String word = sentence.substring(wrdStrt, wrdEnd);//words[i].trim(); word = word.replace(REGEX, ""); - //Skip stop words and stem the word + // Skip stop words and stem the word if (sw.isStopWord(word)) continue; String stemedWrd = stemmer.stem(word).toString(); - //update iidx by adding the current sentence to the list + // update iidx by adding the current sentence to the list if (iidx != null) { if 
(stemedWrd.length() > 1) { List<Integer> sentList = iidx.get(stemedWrd); @@ -107,7 +110,7 @@ public class DefaultDocProcessor implements DocProcessor { } sentList.add(sentCnt); - //Save it back + // Save it back iidx.put(stemedWrd, sentList); } } @@ -121,60 +124,77 @@ public class DefaultDocProcessor implements DocProcessor { } - public String docToString(String fileName) { - StringBuilder docBuffer = new StringBuilder(); - - try (InputStream in = DefaultDocProcessor.class.getResourceAsStream(fileName); - LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) { - String nextLine; - - while ((nextLine = lnr.readLine()) != null) { - String trimmedLine = nextLine.trim(); - if (!trimmedLine.isEmpty()) { - docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" "); + /** + * Reads a document's content from a file. + * + * @param fileName The path relative file reference of the resource to read in. + * If {@code null} or empty, an empty String is returned. + * @return A string representation of the file's contents. 
+ */ + public String docToString(String fileName) throws IOException { + if (fileName == null || fileName.isBlank()) { + return ""; + } else { + StringBuilder docBuffer = new StringBuilder(); + try (InputStream in = DefaultDocProcessor.class.getResourceAsStream(fileName); + LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) { + String nextLine; + + while ((nextLine = lnr.readLine()) != null) { + String trimmedLine = nextLine.trim(); + if (!trimmedLine.isEmpty()) { + docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" "); + } + } + } - } catch (Exception ex) { - Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex); + return docBuffer.toString(); } - return docBuffer.toString(); } - //List of sentences form a document - public List<Sentence> docToSentList(String fileName) { - List<Sentence> sentList = new ArrayList<>(); - - try (LineNumberReader lnr = new LineNumberReader(new FileReader(fileName))) { - String nextLine; - int paraNo = 0; - int sentNo = 0; - while ((nextLine = lnr.readLine()) != null) { - String trimmedLine = nextLine.trim(); - if (!trimmedLine.isEmpty()) { - List<String> sents = new ArrayList<>(); - List<String> cleanedSents = new ArrayList<>(); - this.getSentences(trimmedLine, sents, null, cleanedSents); - int paraPos = 1; - for (String sen : sents) { - Sentence s = new Sentence(); - s.setSentId(sentNo++); - s.setParagraph(paraNo); - s.setStringVal(sen); - s.setParaPos(paraPos++); - sentList.add(s); + /** + * Reads a document's content from a file. + * + * @param fileName The path relative file reference of the resource to read in. + * If {@code null} or empty, an empty List is returned. + * @return A list of {@link Sentence sentences} representing the file's contents. 
+ */ + public List<Sentence> docToSentences(String fileName) throws IOException { + if (fileName == null || fileName.isBlank()) { + return Collections.emptyList(); + } else { + List<Sentence> sentList = new ArrayList<>(); + try (InputStream in = DefaultDocProcessor.class.getResourceAsStream(fileName); + LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) { + String nextLine; + int paraNo = 0; + int sentNo = 0; + while ((nextLine = lnr.readLine()) != null) { + String trimmedLine = nextLine.trim(); + if (!trimmedLine.isEmpty()) { + List<String> sents = new ArrayList<>(); + List<String> cleanedSents = new ArrayList<>(); + this.getSentences(trimmedLine, sents, null, cleanedSents); + int paraPos = 1; + for (String sen : sents) { + Sentence s = new Sentence(sentNo++, sen, paraNo, paraPos++); + sentList.add(s); + } + paraNo++; } - paraNo++; } } - - } catch (Exception ex) { - Logger.getLogger(DefaultDocProcessor.class.getName()).log(Level.SEVERE, null, ex); + return sentList; } - return sentList; } + /** + * {@inheritDoc} + */ @Override - public List<Sentence> getSentencesFromStr(String text) { + public List<Sentence> getSentences(String text) { + if (text == null || text.isBlank()) { + return Collections.emptyList(); + } List<Sentence> ret = new ArrayList<>(); List<String> sentStrs = new ArrayList<>(); List<String> cleanedSents = new ArrayList<>(); @@ -188,24 +208,28 @@ public class DefaultDocProcessor implements DocProcessor { Collections.addAll(sentStrs, sentences); } int sentNo = 0; - for (String sen : sentStrs) { - Sentence s = new Sentence(); - s.setSentId(sentNo); - s.setParagraph(1); - s.setStringVal(sen); - s.setParaPos(sentNo); + Sentence s = new Sentence(sentNo, sen, 1, sentNo); ret.add(s); sentNo++; } return ret; } + /** + * {@inheritDoc} + */ @Override public String[] getWords(String sent) { + if (sent == null || sent.isBlank()) { + return new String[0]; + } return sent.trim().split("\\s+"); } + /** + * {@inheritDoc} + */ @Override public 
Stemmer getStemmer() { return stemmer; diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java b/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java index 8b88cd6..b6eef0b 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/IDFWordWeight.java @@ -17,6 +17,7 @@ package opennlp.summarization.preprocess; +import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Hashtable; @@ -29,12 +30,17 @@ import java.io.LineNumberReader; * @see WordWeight */ public class IDFWordWeight implements WordWeight { + private static IDFWordWeight instance; final Hashtable<String, Double> idf; public IDFWordWeight(String fileName) { idf = new Hashtable<>(); - load(fileName); + try { + load(fileName); + } catch (IOException e) { + throw new RuntimeException("Could not load the file with IDF", e); + } } public static IDFWordWeight getInstance(String fileName) { @@ -58,7 +64,7 @@ public class IDFWordWeight implements WordWeight { * Loads the IDF for words from given file. The file is required to have a simple format - * word, IDF. 
*/ - private void load(String fileName) { + private void load(String fileName) throws IOException { try (InputStream in = IDFWordWeight.class.getResourceAsStream(fileName); LineNumberReader lnr = new LineNumberReader(new InputStreamReader(in))) { @@ -72,9 +78,6 @@ public class IDFWordWeight implements WordWeight { idf.put(word, idfVal); } } - } catch (Exception ex) { - System.err.println("Could not load the file with IDF"); - ex.printStackTrace(); } } } diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java b/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java index deb338d..c558dee 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/StopWords.java @@ -215,7 +215,7 @@ public class StopWords { h.add("your"); h.add("yours"); h.add("yourself"); - h.add("yourselves "); + h.add("yourselves"); } public static StopWords getInstance() { diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java b/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java index 97866aa..1998434 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/WordWeight.java @@ -17,7 +17,14 @@ package opennlp.summarization.preprocess; +/** + * Represents a type which can compute the weight of a word in a certain context, e.g. a sentence or a text. + */ public interface WordWeight { - double getWordWeight(String s); + /** + * @param token The input token (word) to get a weight for. Must not be {@code null}. + * @return The associated weight for the specified {@code token}. 
+ */ + double getWordWeight(String token); } diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java index 3ead306..fc359e7 100755 --- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java +++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRank.java @@ -30,46 +30,74 @@ import opennlp.summarization.preprocess.StopWords; import opennlp.summarization.preprocess.WordWeight; /** - * Implements the TextRank algorithm by Mihalcea et al. - * <p> + * Implements the TextRank algorithm by Rada Mihalcea and Paul Tarau: <br/> + * <a href="https://aclanthology.org/W04-3252/">TextRank: Bringing Order into Text</a> + * <br/><br/> * This basically applies the page rank algorithm to a graph where each sentence is a node * and a connection between sentences indicates that a word is shared between them. + * <p> * It returns a ranking of sentences where the highest rank means most important etc. * Currently, only stemming is done to the words; a more sophisticated way might use a * resource like Wordnet to match synonyms etc. */ public class TextRank { + private static final int NO_OF_IT = 100; // DAMPING FACTOR.. private static final double DF = 0.15; private static final boolean HIGHER_TITLE_WEIGHT = true; private static final double TITLE_WRD_WT = 2d; + + private final DocProcessor docProc; private final StopWords sw; private final WordWeight wordWt; + private final double maxErr = 0.1; private final double title_wt = 0; - private String article; - private Hashtable<Integer, List<Integer>> links; + + private Hashtable<Integer, List<Integer>> links = new Hashtable<>(); private List<String> sentences = new ArrayList<>(); private List<String> processedSent = new ArrayList<>(); - private DocProcessor docProc; + /** + * Instantiates a {@link TextRank} with the specified {@link DocProcessor}. + * + * @param dp A valid {@link DocProcessor}. 
Must not be {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ public TextRank(DocProcessor dp) { - sw = new StopWords(); - setLinks(new Hashtable<>()); - processedSent = new ArrayList<>(); - docProc = dp; - wordWt = IDFWordWeight.getInstance("/meta/idf.csv"); + this(dp, new StopWords(), IDFWordWeight.getInstance("/idf.csv")); } - public TextRank(StopWords sw, WordWeight wordWts) { - this.sw = sw; - this.wordWt = wordWts; + /** + * Instantiates a {@link TextRank} with the specified {@link DocProcessor}. + * + * @param dp A valid {@link DocProcessor}. Must not be {@code null}. + * @param stopWords The {@link StopWords} instance to use. Must not be {@code null}. + * @param wordWeights The {@link WordWeight} instance to use. Must not be {@code null}. + * + * @throws IllegalArgumentException Thrown if parameters are invalid. + */ + public TextRank(DocProcessor dp, StopWords stopWords, WordWeight wordWeights) { + if (dp == null) throw new IllegalArgumentException("parameter 'dp' must not be null"); + if (stopWords == null) throw new IllegalArgumentException("parameter 'stopWords' must not be null"); + if (wordWeights == null) throw new IllegalArgumentException("parameter 'wordWeights' must not be null"); + this.docProc = dp; + this.sw = stopWords; + this.wordWt = wordWeights; } - // Returns similarity of two sentences. Wrd wts contains tf-idf of the words.. - public double getWeightedSimilarity(String sent1, String sent2, - Hashtable<String, Double> wrdWts) { + /** + * Computes the similarity of two sentences. + * + * @param sent1 The first sentence. If {@code null} or empty the computation will result in {@code 0.0}. + * @param sent2 The second sentence. If {@code null} or empty the computation will result in {@code 0.0}. + * @param wrdWts The mapping table containing tf-idf of the words. + * @return The computed similarity. If no similarity exists, the resulting value equals {@code 0.0}. 
+ */ + public double getWeightedSimilarity(String sent1, String sent2, Hashtable<String, Double> wrdWts) { + String[] words1 = docProc.getWords(sent1); String[] words2 = docProc.getWords(sent2); double wordsInCommon = 0; @@ -97,13 +125,17 @@ public class TextRank { return (wordsInCommon) / (words1.length + words2.length); } - // Gets the current score from the list of scores passed ... + /** + * @param scores A list of {@link Score} instances. + * @param id The sentence id to check for. + * @return Gets the element from {@code scores} that matches the passed sentence {@code id}. + */ public double getScoreFrom(List<Score> scores, int id) { for (Score s : scores) { if (s.getSentId() == id) return s.getScore(); } - return 1; + return 1; // Why is the default score "1" here? } // This method runs the page rank algorithm for the sentences. @@ -114,9 +146,7 @@ public class TextRank { List<Score> currWtScores = new ArrayList<>(); // Start with equal weights for all sentences for (int i = 0; i < rawScores.size(); i++) { - Score ns = new Score(); - ns.setSentId(rawScores.get(i).getSentId()); - ns.setScore((1 - title_wt) / (rawScores.size()));// this.getSimilarity(); + Score ns = new Score(rawScores.get(i).getSentId(), (1 - title_wt) / (rawScores.size())); // this.getSimilarity(); currWtScores.add(ns); } // currWtScores.get(0).score = this.title_wt; @@ -129,8 +159,6 @@ public class TextRank { // Update the scores for the current iteration.. 
for (Score rs : rawScores) { int sentId = rs.getSentId(); - Score ns = new Score(); - ns.setSentId(sentId); List<Integer> neighbors = getLinks().get(sentId); double sum = 0; @@ -145,7 +173,7 @@ public class TextRank { sum += wij / sigmawjk * txtRnkj; } } - ns.setScore((1d - DF) + sum * DF);// * rs.score + Score ns = new Score(sentId, (1d - DF) + sum * DF); // * rs.score totErr += ns.getScore() - getScoreFrom(rawScores, sentId); newWtScores.add(ns); } @@ -169,8 +197,7 @@ public class TextRank { for (int i = 0; i < sentences.size(); i++) { String nextSent = sentences.get(i); String[] words = docProc.getWords(nextSent); - Score s = new Score(); - s.setSentId(i); + Score s = new Score(i, 0d); for (String word : words) { String currWrd = docProc.getStemmer().stem(word).toString(); //stemmer.toString(); @@ -220,7 +247,7 @@ public class TextRank { this.sentences = sentences; this.processedSent = processedSent; - Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx);// new + Hashtable<String, Double> wrdWts = toWordWtHashtable(this.wordWt, iidx); // new if (HIGHER_TITLE_WEIGHT && !getSentences().isEmpty()) { String sent = getSentences().get(0); @@ -250,14 +277,6 @@ public class TextRank { this.sentences = sentences; } - public String getArticle() { - return article; - } - - public void setArticle(String article) { - this.article = article; - } - public Hashtable<Integer, List<Integer>> getLinks() { return links; } @@ -265,14 +284,5 @@ public class TextRank { private void setLinks(Hashtable<Integer, List<Integer>> links) { this.links = links; } -} -/* - * public double getScore(String sent1, String sent2, boolean toPrint) { - * String[] words1 = sent1.split("\\s+"); String[] words2 = sent2.split("\\s+"); - * double wordsInCommon = 0; for(int i=0;i< words1.length;i++) { for(int - * j=0;j<words2.length;j++) { if(!sw.isStopWord(words1[i]) && - * !words1[i].trim().isEmpty() && words1[i].equals(words2[j])) { wordsInCommon+= - * wordWt.getWordWeight(words1[i]); 
} } } return ((double)wordsInCommon) / - * (Math.log(1+words1.length) + Math.log(1+words2.length)); } - */ \ No newline at end of file +} diff --git a/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java b/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java index 765bb94..3c1a3e3 100755 --- a/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java +++ b/summarizer/src/main/java/opennlp/summarization/textrank/TextRankSummarizer.java @@ -108,7 +108,7 @@ public class TextRankSummarizer implements Summarizer { @Override public String summarize(String article, int maxWords) { - List<Sentence> sentences = docProcessor.getSentencesFromStr(article); + List<Sentence> sentences = docProcessor.getSentences(article); List<Score> scores = rankSentences(article, sentences, maxWords); return scores2String(sentences, scores, maxWords); } diff --git a/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java index ce7bc50..ec31f79 100644 --- a/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java +++ b/summarizer/src/test/java/opennlp/summarization/AbstractSummarizerTest.java @@ -17,7 +17,7 @@ package opennlp.summarization; -import opennlp.summarization.lexicalchaining.OpenNLPPOSTagger; +import opennlp.summarization.lexicalchaining.NounPOSTagger; import opennlp.summarization.preprocess.DefaultDocProcessor; import org.junit.jupiter.api.BeforeAll; @@ -37,12 +37,12 @@ public abstract class AbstractSummarizerTest { private static final Logger log = LoggerFactory.getLogger(AbstractSummarizerTest.class); protected static DefaultDocProcessor docProcessor; - protected static OpenNLPPOSTagger posTagger; + protected static NounPOSTagger posTagger; @BeforeAll static void initEnv() throws IOException { - docProcessor = new 
DefaultDocProcessor(AbstractSummarizerTest.class.getResourceAsStream("/en-sent.bin")); - posTagger = new OpenNLPPOSTagger(docProcessor, AbstractSummarizerTest.class.getResourceAsStream("/en-pos-maxent.bin")); + docProcessor = new DefaultDocProcessor("en"); + posTagger = new NounPOSTagger("en"); } /** @@ -52,17 +52,17 @@ public abstract class AbstractSummarizerTest { @ParameterizedTest(name = "news story {index}") @ValueSource(strings = { - "/meta/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story", - "/meta/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story", - "/meta/0a3040b6c1bba95efca727158f128a19c44ec8ba.story", - "/meta/0a3479b53796863a664c32ca20d8672583335d2a.story", - "/meta/0a3639cb86487e72e2ba084211f99799918aedf8.story", - "/meta/0a4092bef1801863296777ebcfeceb1aec23c78f.story", - "/meta/0a5458d3427b290524a8df11d8503a5b57b32747.story", - "/meta/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story", - "/meta/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story" + "/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story", + "/news/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story", + "/news/0a3040b6c1bba95efca727158f128a19c44ec8ba.story", + "/news/0a3479b53796863a664c32ca20d8672583335d2a.story", + "/news/0a3639cb86487e72e2ba084211f99799918aedf8.story", + "/news/0a4092bef1801863296777ebcfeceb1aec23c78f.story", + "/news/0a5458d3427b290524a8df11d8503a5b57b32747.story", + "/news/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story", + "/news/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story" }) - public void testSummarize(String filename) { + public void testSummarize(String filename) throws IOException { String article = docProcessor.docToString(filename); String summary = getSummarizer().summarize(article, 20); assertNotNull(summary); diff --git a/summarizer/src/test/java/opennlp/summarization/SentenceTest.java b/summarizer/src/test/java/opennlp/summarization/SentenceTest.java new file mode 100644 index 0000000..28f9fc1 --- /dev/null +++ 
b/summarizer/src/test/java/opennlp/summarization/SentenceTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.summarization; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.NullAndEmptySource; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class SentenceTest { + + private static final String SENTENCE = "This example is available in many tests."; + + // SUT + private Sentence sentence; + + @BeforeEach + public void setUp() { + sentence = new Sentence(0, SENTENCE, 0, 0); + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @NullAndEmptySource + public void testConstructInvalid1(String input) { + assertThrows(IllegalArgumentException.class, () -> new Sentence(0, input, 0, 0)); + } + + @ParameterizedTest + @ValueSource(ints = {Integer.MIN_VALUE, -42, -1}) + public void 
testConstructInvalid2(int input) { + assertThrows(IllegalArgumentException.class, () -> new Sentence(input, SENTENCE, 0, 0)); + } + + @ParameterizedTest + @ValueSource(ints = {Integer.MIN_VALUE, -42, -1}) + public void testConstructInvalid3(int input) { + assertThrows(IllegalArgumentException.class, () -> new Sentence(0, SENTENCE, input, 0)); + } + + @ParameterizedTest + @ValueSource(ints = {Integer.MIN_VALUE, -42, -1}) + public void testConstructInvalid4(int input) { + assertThrows(IllegalArgumentException.class, () -> new Sentence(0, SENTENCE, 0, input)); + } + + @Test + public void testSentenceIdentity() { + assertEquals(0, sentence.getSentId()); + assertEquals(0, sentence.getParagraph()); + assertEquals(0, sentence.getParaPos()); + assertEquals(SENTENCE, sentence.getStringVal()); + } + + @Test + public void testStem() { + String stemmed = sentence.stem(); + assertNotNull(stemmed); + assertFalse(stemmed.isBlank()); + assertEquals("Thi exampl avail mani test ", stemmed); + } + + @Test + public void testGetWrdCnt() { + int wordCountWithoutStopwords = sentence.getWordCnt(); + assertEquals(5, wordCountWithoutStopwords); + } + + @Test + public void testHashcode() { + int hash = sentence.hashCode(); + assertEquals(hash, new Sentence(0, SENTENCE, 0, 0).hashCode()); + } + + @Test + public void testEquals() { + assertEquals(sentence, new Sentence(0, SENTENCE, 0, 0)); + } + + @Test + public void testToString() { + assertEquals(sentence.toString(), new Sentence(0, SENTENCE, 0, 0).toString()); + } +} diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/AbstractLexicalChainTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/AbstractLexicalChainTest.java new file mode 100644 index 0000000..b2bca3c --- /dev/null +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/AbstractLexicalChainTest.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license 
agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.summarization.lexicalchaining; + +import opennlp.summarization.preprocess.DefaultDocProcessor; +import org.junit.jupiter.api.BeforeAll; + +public abstract class AbstractLexicalChainTest { + + protected static final String ARTICLE = + "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"." + + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. " + + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. " + + "If Syria fails to comply, the deal could be enforced by a UN resolution. " + + "China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. 
" + + "In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia."; + + protected static DefaultDocProcessor dp; + protected static LexicalChainingSummarizer lcs; + + @BeforeAll + static void initEnv() throws Exception { + dp = new DefaultDocProcessor("en"); + lcs = new LexicalChainingSummarizer(dp, "en"); + } +} diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainTest.java deleted file mode 100644 index 8655922..0000000 --- a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainTest.java +++ /dev/null @@ -1,109 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package opennlp.summarization.lexicalchaining; - -import opennlp.summarization.Sentence; -import opennlp.summarization.preprocess.DefaultDocProcessor; - -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import java.util.Collections; -import java.util.Hashtable; -import java.util.List; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assertions.assertNotNull; -import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; - -class LexChainTest { - - private static final String ARTICLE = - "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"." - + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. " - + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. " - + "If Syria fails to comply, the deal could be enforced by a UN resolution. " - + "China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. 
" - + "In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia."; - - private static DefaultDocProcessor dp; - private static LexicalChainingSummarizer lcs; - - @BeforeAll - static void initEnv() throws Exception { - dp = new DefaultDocProcessor(LexChainTest.class.getResourceAsStream("/en-sent.bin")); - lcs = new LexicalChainingSummarizer(dp, LexChainTest.class.getResourceAsStream("/en-pos-maxent.bin")); - } - - @Test - void testBuildLexicalChains() { - List<Sentence> sent = dp.getSentencesFromStr(ARTICLE); - assertNotNull(sent); - List<LexicalChain> vh = lcs.buildLexicalChains(ARTICLE, sent); - assertNotNull(vh); - Collections.sort(vh); - assertTrue(!vh.isEmpty()); - - List<Sentence> s = dp.getSentencesFromStr(ARTICLE); - Hashtable<String, Boolean> comp = new Hashtable<>(); - - for (int i = vh.size() - 1; i >= Math.max(vh.size() - 50, 0); i--) { - LexicalChain lc = vh.get(i); - - if (!(comp.containsKey(lc.getWord().get(0).getLexicon()))) { - comp.put(lc.getWord().get(0).getLexicon(), Boolean.TRUE); - /* - for(int j=0;j<lc.getWord().size();j++) - System.out.print(lc.getWord().get(j) + " -- "); - */ - - assertEquals(1.0d, lc.score()); - /* - for(Sentence sid : lc.getSentences()) { - //if(sid>=0 && sid<s.size()) - System.out.println(sid); - } - */ - } - } - - } - - @Test - void testGetRelation() { - try { - WordRelationshipDetermination lcs = new WordRelationshipDetermination(); - LexicalChain l = new LexicalChain(); - List<Word> words = lcs.getWordSenses("music"); - - l.addWord(words.get(0)); - // int rel = lcs.getRelation(l, "nation"); - WordRelation rel2 = lcs.getRelation(l, "tune", true); - WordRelation rel3 = lcs.getRelation(l, "vocal", true); - assertEquals(1, rel2.relation()); - assertEquals(1, rel3.relation()); - // assertEquals(rel, LexicalChainingSummarizer.STRONG_RELATION); - assertEquals(WordRelation.MED_RELATION, rel2.relation()); - assertEquals(WordRelation.MED_RELATION, rel3.relation()); - 
} catch (Exception e) { - fail(e.getLocalizedMessage()); - } - } - -} diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java index 1bb476a..66fa5d9 100644 --- a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractorTest.java @@ -17,43 +17,69 @@ package opennlp.summarization.lexicalchaining; +import java.util.Collections; import java.util.List; +import opennlp.summarization.Sentence; + import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; - -import opennlp.summarization.Sentence; -import opennlp.summarization.preprocess.DefaultDocProcessor; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; +import static org.junit.jupiter.api.Assertions.assertTrue; -class LexChainingKeywordExtractorTest { +class LexChainingKeywordExtractorTest extends AbstractLexicalChainTest { - private static final String ARTICLE = - "US President Barack Obama has welcomed an agreement between the US and Russia under which Syria's chemical weapons must be destroyed or removed by mid-2014 as an \"important step\"." - + "But a White House statement cautioned that the US expected Syria to live up to its public commitments. " - + "The US-Russian framework document stipulates that Syria must provide details of its stockpile within a week. " - + "If Syria fails to comply, the deal could be enforced by a UN resolution. 
" - + "China, France, the UK, the UN and Nato have all expressed satisfaction at the agreement. " - + "In Beijing, Foreign Minister Wang Yi said on Sunday that China welcomes the general agreement between the US and Russia."; + private static List<LexicalChain> chains; - private static DefaultDocProcessor dp; - private static LexicalChainingSummarizer lcs; + // SUT + private LexChainingKeywordExtractor keywordExtractor; @BeforeAll static void initEnv() throws Exception { - dp = new DefaultDocProcessor(LexChainingKeywordExtractorTest.class.getResourceAsStream("/en-sent.bin")); - lcs = new LexicalChainingSummarizer(dp, LexChainingKeywordExtractorTest.class.getResourceAsStream("/en-pos-maxent.bin")); + AbstractLexicalChainTest.initEnv(); + // Prep + List<Sentence> sent = dp.getSentences(ARTICLE); + assertNotNull(sent); + assertFalse(sent.isEmpty()); + chains = lcs.buildLexicalChains(ARTICLE, sent); + assertNotNull(chains); + assertFalse(chains.isEmpty()); } - @Test - void testGetKeywords() { - List<Sentence> sent = dp.getSentencesFromStr(ARTICLE); - List<LexicalChain> vh = lcs.buildLexicalChains(ARTICLE, sent); - LexChainingKeywordExtractor ke = new LexChainingKeywordExtractor(); - List<String> keywords = ke.getKeywords(vh, 5); + @BeforeEach + public void setUp() { + keywordExtractor = new LexChainingKeywordExtractor(); + } + + @ParameterizedTest + @ValueSource(ints = {1, 5, 42, Integer.MAX_VALUE}) + void testExtractKeywords(int noOfKeywords) { + List<String> keywords = keywordExtractor.extractKeywords(chains, noOfKeywords); assertNotNull(keywords); assertFalse(keywords.isEmpty()); } + + @Test + void testExtractKeywordsWithEmptyInput() { + List<String> keywords = keywordExtractor.extractKeywords(Collections.emptyList(), 5); + assertNotNull(keywords); + assertTrue(keywords.isEmpty()); + } + + @Test + void testExtractKeywordsInvalid1() { + assertThrows(IllegalArgumentException.class, () -> keywordExtractor.extractKeywords(null, 5)); + } + + @ParameterizedTest + 
@ValueSource(ints = {Integer.MIN_VALUE, -1, 0}) + void testExtractKeywordsInvalid2(int noOfKeywords) { + assertThrows(IllegalArgumentException.class, () -> keywordExtractor.extractKeywords(chains, noOfKeywords)); + } } diff --git a/summarizer/src/test/java/opennlp/summarization/preprocess/DocProcessorTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java similarity index 54% rename from summarizer/src/test/java/opennlp/summarization/preprocess/DocProcessorTest.java rename to summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java index ce31c26..8f82065 100644 --- a/summarizer/src/test/java/opennlp/summarization/preprocess/DocProcessorTest.java +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java @@ -15,32 +15,33 @@ * limitations under the License. */ -package opennlp.summarization.preprocess; +package opennlp.summarization.lexicalchaining; -import java.util.List; +import opennlp.summarization.AbstractSummarizerTest; +import opennlp.summarization.Summarizer; -import org.junit.jupiter.api.BeforeAll; -import org.junit.jupiter.api.Test; - -import opennlp.summarization.Sentence; +import org.junit.jupiter.api.BeforeEach; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; -class DocProcessorTest { +/** + * Tests the implementation of {@link LexicalChainingSummarizer}. 
+ */ +public class LexicalChainingSummarizerNewsTest extends AbstractSummarizerTest { - private static DefaultDocProcessor dp; + // SUT + private LexicalChainingSummarizer lexicalChainSummarizer; - @BeforeAll - static void initEnv() { - dp = new DefaultDocProcessor(DocProcessorTest.class.getResourceAsStream("/en-sent.bin")); + @BeforeEach + void setUp() { + lexicalChainSummarizer = new LexicalChainingSummarizer(docProcessor, posTagger); } - @Test - void testGetSentencesFromStr() { - String sent = "This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes."; - List<Sentence> doc = dp.getSentencesFromStr(sent); - //dp.docToString(fileName); - assertEquals(doc.size(), 3); + @Override + public Summarizer getSummarizer() { + return lexicalChainSummarizer; } - + } diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java index 5d23bef..435c727 100644 --- a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerTest.java @@ -17,27 +17,58 @@ package opennlp.summarization.lexicalchaining; -import opennlp.summarization.AbstractSummarizerTest; -import opennlp.summarization.Summarizer; +import java.util.Collections; +import java.util.Hashtable; +import java.util.List; +import java.util.Map; import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; -/** - * Tests the implementation of {@link LexicalChainingSummarizer}. 
- */ -public class LexicalChainingSummarizerTest extends AbstractSummarizerTest { +import opennlp.summarization.Sentence; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +class LexicalChainingSummarizerTest extends AbstractLexicalChainTest { - // SUT - private Summarizer lexicalChainSummarizer; + private List<Sentence> sent; @BeforeEach void setUp() { - lexicalChainSummarizer = new LexicalChainingSummarizer(docProcessor, posTagger); + sent = dp.getSentences(ARTICLE); + assertNotNull(sent); } - @Override - public Summarizer getSummarizer() { - return lexicalChainSummarizer; + @Test + void testBuildLexicalChains() { + List<LexicalChain> vh = lcs.buildLexicalChains(ARTICLE, sent); + assertNotNull(vh); + Collections.sort(vh); + assertFalse(vh.isEmpty()); + + Map<String, Boolean> comp = new Hashtable<>(); + + for (int i = vh.size() - 1; i >= Math.max(vh.size() - 50, 0); i--) { + LexicalChain lc = vh.get(i); + Word w = lc.getWords().get(0); + if (!(comp.containsKey(w.getLexicon()))) { + comp.put(w.getLexicon(), Boolean.TRUE); + /* + for(int j=0;j<lc.getWord().size();j++) + System.out.print(lc.getWord().get(j) + " -- "); + */ + + // assertEquals(1.0d, lc.score()); + /* + System.out.println(lc + ": "); + for(Sentence sid : lc.getSentences()) { + //if(sid>=0 && sid<s.size()) + System.out.println("\t" + sid + " [" + lc.score() + "]"); + } + */ + } + } } } diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/NounPOSTaggerTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/NounPOSTaggerTest.java new file mode 100644 index 0000000..2822299 --- /dev/null +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/NounPOSTaggerTest.java @@ -0,0 +1,104 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.summarization.lexicalchaining; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.EmptySource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.io.IOException; +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class NounPOSTaggerTest { + + private static final String UNTAGGED_SENTENCE = "This is a test ."; + private static final String[] TOKENS_SENTENCE = {"This", "is", "a", "test", "."}; + private static final String[] TOKENS_TAGGED_SENTENCE = {"This/PRON", "is/AUX", "a/DET", "test/NOUN", "./PUNCT"}; + + private static POSTagger tagger; // SUT + + @BeforeAll + public static void initResources() throws IOException { + tagger = new NounPOSTagger("en"); + } + + @Test + void testConstructWithInvalidResource() { + assertThrows(IllegalArgumentException.class, () -> new NounPOSTagger(null)); + } + + @Test + void testGetTaggedString() { + String tagged = tagger.getTaggedString(UNTAGGED_SENTENCE); + assertNotNull(tagged); + assertEquals("This/PRON is/AUX 
a/DET test/NOUN ./PUNCT", tagged); + } + + @Test + void testGetTaggedStringInvalid1() { + assertThrows(IllegalArgumentException.class, () -> tagger.getTaggedString(null)); + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @EmptySource + void testGetTaggedStringInvalid2(String input) { + String tagged = tagger.getTaggedString(input); + assertNotNull(tagged); + } + + @Test + void testGetWordsOfTypeWithTags() { + List<String> filteredByType = tagger.getWordsOfType(TOKENS_TAGGED_SENTENCE, POSTagger.NOUN); + assertNotNull(filteredByType); + assertEquals(1, filteredByType.size()); + assertEquals("test", filteredByType.get(0)); + } + + @Test + void testGetWordsOfTypeWithoutTags() { + assertThrows(IllegalArgumentException.class, () -> + tagger.getWordsOfType(TOKENS_SENTENCE, POSTagger.NOUN)); + } + + @ParameterizedTest + @ValueSource(ints = {POSTagger.ADJECTIVE, POSTagger.ADVERB, POSTagger.VERB}) + void testGetWordsOfTypeWithNonMatchingType(int type) { + List<String> filteredByType = tagger.getWordsOfType(TOKENS_TAGGED_SENTENCE, type); + assertNotNull(filteredByType); + assertEquals(0, filteredByType.size()); + } + + @ParameterizedTest + @ValueSource(ints = {Integer.MIN_VALUE, -1, 5, Integer.MAX_VALUE}) + void testGetWordsOfTypeWithInvalidType(int type) { + assertThrows(IllegalArgumentException.class, () -> + tagger.getWordsOfType(TOKENS_TAGGED_SENTENCE, type)); + } + + @Test + void testGetWordsOfTypeWithInvalidInput() { + assertThrows(IllegalArgumentException.class, () -> + tagger.getWordsOfType(null, POSTagger.NOUN)); + } +} diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordRelationshipDeterminationTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordRelationshipDeterminationTest.java new file mode 100644 index 0000000..bd8845f --- /dev/null +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordRelationshipDeterminationTest.java @@ -0,0 +1,63 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.summarization.lexicalchaining; + +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +public class WordRelationshipDeterminationTest { + + // SUT + private WordRelationshipDetermination wrd; + + @BeforeEach + public void setUp() { + wrd = new WordRelationshipDetermination(); + } + + @Test + void testGetWordSenses() { + LexicalChain l = new LexicalChain(); + List<Word> words = wrd.getWordSenses("music"); + assertNotNull(words); + assertFalse(words.isEmpty()); + l.addWord(words.get(0)); + } + + @Test + void testGetRelation() { + LexicalChain l = new LexicalChain(); + List<Word> words = wrd.getWordSenses("music"); + assertNotNull(words); + assertFalse(words.isEmpty()); + l.addWord(words.get(0)); + // int rel = lcs.getRelation(l, "nation"); + WordRelation rel2 = wrd.getRelation(l, "tune", true); + WordRelation rel3 = wrd.getRelation(l, "vocal", true); + assertEquals(1, rel2.relation()); + assertEquals(1, rel3.relation()); + // 
assertEquals(rel, LexicalChainingSummarizer.STRONG_RELATION); + assertEquals(WordRelation.MED_RELATION, rel2.relation()); + assertEquals(WordRelation.MED_RELATION, rel3.relation()); + } +} diff --git a/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordnetWordTest.java b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordnetWordTest.java new file mode 100644 index 0000000..ab25c21 --- /dev/null +++ b/summarizer/src/test/java/opennlp/summarization/lexicalchaining/WordnetWordTest.java @@ -0,0 +1,107 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package opennlp.summarization.lexicalchaining; + +import edu.mit.jwi.item.ISynsetID; +import edu.mit.jwi.item.IWordID; +import edu.mit.jwi.item.POS; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.NullAndEmptySource; +import org.junit.jupiter.params.provider.ValueSource; + +import java.util.List; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; + +public class WordnetWordTest { + + private WordRelationshipDetermination wrd; + + // SUT + private Word word; + + @BeforeEach + public void setUp() { + wrd = new WordRelationshipDetermination(); + List<Word> words = wrd.getWordSenses("music"); + assertNotNull(words); + assertFalse(words.isEmpty()); + word = words.get(0); + assertNotNull(word); + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @NullAndEmptySource + public void testConstructInvalid1(String input) { + assertThrows(IllegalArgumentException.class, () -> new WordnetWord(input, new DummyWordID())); + } + + @Test + public void testConstructInvalid2() { + assertThrows(IllegalArgumentException.class, () -> new WordnetWord("music", null)); + } + + @Test + public void testSentenceIdentity() { + assertEquals("music", word.getLexicon()); + assertEquals("WID-07034009-N-01-music", word.getID().toString()); + } + + @Test + public void testHashcode() { + int hash = word.hashCode(); + assertEquals(hash, wrd.getWordSenses("music").get(0).hashCode()); + } + + @Test + public void testEquals() { + assertEquals(word, wrd.getWordSenses("music").get(0)); + } + + @Test + public void testToString() { + assertEquals(word.toString(), wrd.getWordSenses("music").get(0).toString()); + } + + private static class DummyWordID implements 
IWordID { + @Override + public ISynsetID getSynsetID() { + return null; + } + + @Override + public int getWordNumber() { + return 0; + } + + @Override + public String getLemma() { + return ""; + } + + @Override + public POS getPOS() { + return null; + } + } +} diff --git a/summarizer/src/test/java/opennlp/summarization/preprocess/DefaultDocProcessorTest.java b/summarizer/src/test/java/opennlp/summarization/preprocess/DefaultDocProcessorTest.java new file mode 100644 index 0000000..dc95239 --- /dev/null +++ b/summarizer/src/test/java/opennlp/summarization/preprocess/DefaultDocProcessorTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package opennlp.summarization.preprocess; + +import java.io.IOException; +import java.util.List; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import opennlp.summarization.Sentence; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.NullAndEmptySource; +import org.junit.jupiter.params.provider.ValueSource; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; + +class DefaultDocProcessorTest { + + private static DefaultDocProcessor dp; + + @BeforeAll + static void initEnv() throws IOException { + dp = new DefaultDocProcessor("en"); + } + + @Test + void testGetSentences() { + String sent = "This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes."; + List<Sentence> doc = dp.getSentences(sent); + assertNotNull(doc); + assertEquals(3, doc.size()); + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @NullAndEmptySource + void testGetSentencesInvalid(String input) { + List<Sentence> doc = dp.getSentences(input); + assertNotNull(doc); + assertEquals(0, doc.size()); + } + + @Test + void testGetWords() { + String sent = "This is a sentence, with some punctuations; to test if the sentence breaker can handle it! Is every thing working OK ? Yes."; + List<Sentence> doc = dp.getSentences(sent); + assertNotNull(doc); + assertEquals(3, doc.size()); + for (Sentence sentence : doc) { + String[] words = dp.getWords(sentence.getStringVal()); + assertNotNull(words); + assertTrue(words.length > 0); + assertTrue(words.length >= sentence.getWordCnt()); // due to stop words not counted, this must hold. 
+ } + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @NullAndEmptySource + void testGetWordsInvalid(String input) { + String[] words = dp.getWords(input); + assertNotNull(words); + assertEquals(0, words.length); + } + + @Test + void testDocToString() throws IOException { + String content = dp.docToString("/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story"); + assertNotNull(content); + assertFalse(content.isEmpty()); + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @NullAndEmptySource + void testDocToStringInvalid(String input) throws IOException { + String content = dp.docToString(input); + assertNotNull(content); + assertTrue(content.isEmpty()); + } + + @Test + void testDocToSentences() throws IOException { + List<Sentence> content = dp.docToSentences("/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story"); + assertNotNull(content); + assertFalse(content.isEmpty()); + } + + @ParameterizedTest + @ValueSource(strings = {"\t", "\n", " "}) + @NullAndEmptySource + void testDocToSentencesInvalid(String input) throws IOException { + List<Sentence> content = dp.docToSentences(input); + assertNotNull(content); + assertTrue(content.isEmpty()); + } +} diff --git a/summarizer/src/test/resources/meta/Notes.txt b/summarizer/src/test/resources/Notes.txt similarity index 100% rename from summarizer/src/test/resources/meta/Notes.txt rename to summarizer/src/test/resources/Notes.txt diff --git a/summarizer/src/test/resources/en-pos-maxent.bin b/summarizer/src/test/resources/en-pos-maxent.bin deleted file mode 100644 index 168f259..0000000 Binary files a/summarizer/src/test/resources/en-pos-maxent.bin and /dev/null differ diff --git a/summarizer/src/test/resources/en-sent.bin b/summarizer/src/test/resources/en-sent.bin deleted file mode 100644 index d3a2779..0000000 Binary files a/summarizer/src/test/resources/en-sent.bin and /dev/null differ diff --git a/summarizer/src/test/resources/meta/idf.csv 
b/summarizer/src/test/resources/idf.csv similarity index 100% rename from summarizer/src/test/resources/meta/idf.csv rename to summarizer/src/test/resources/idf.csv diff --git a/summarizer/src/test/resources/meta/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story b/summarizer/src/test/resources/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story similarity index 100% rename from summarizer/src/test/resources/meta/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story rename to summarizer/src/test/resources/news/0a2035f3f73b06a5150a6f01cffdf45d027bbbed.story diff --git a/summarizer/src/test/resources/meta/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story b/summarizer/src/test/resources/news/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story similarity index 100% rename from summarizer/src/test/resources/meta/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story rename to summarizer/src/test/resources/news/0a2278bec4a80aec1bc3e9e7a9dac10ac1b6425b.story diff --git a/summarizer/src/test/resources/meta/0a3040b6c1bba95efca727158f128a19c44ec8ba.story b/summarizer/src/test/resources/news/0a3040b6c1bba95efca727158f128a19c44ec8ba.story similarity index 100% rename from summarizer/src/test/resources/meta/0a3040b6c1bba95efca727158f128a19c44ec8ba.story rename to summarizer/src/test/resources/news/0a3040b6c1bba95efca727158f128a19c44ec8ba.story diff --git a/summarizer/src/test/resources/meta/0a3479b53796863a664c32ca20d8672583335d2a.story b/summarizer/src/test/resources/news/0a3479b53796863a664c32ca20d8672583335d2a.story similarity index 100% rename from summarizer/src/test/resources/meta/0a3479b53796863a664c32ca20d8672583335d2a.story rename to summarizer/src/test/resources/news/0a3479b53796863a664c32ca20d8672583335d2a.story diff --git a/summarizer/src/test/resources/meta/0a3639cb86487e72e2ba084211f99799918aedf8.story b/summarizer/src/test/resources/news/0a3639cb86487e72e2ba084211f99799918aedf8.story similarity index 100% rename from 
summarizer/src/test/resources/meta/0a3639cb86487e72e2ba084211f99799918aedf8.story rename to summarizer/src/test/resources/news/0a3639cb86487e72e2ba084211f99799918aedf8.story diff --git a/summarizer/src/test/resources/meta/0a4092bef1801863296777ebcfeceb1aec23c78f.story b/summarizer/src/test/resources/news/0a4092bef1801863296777ebcfeceb1aec23c78f.story similarity index 100% rename from summarizer/src/test/resources/meta/0a4092bef1801863296777ebcfeceb1aec23c78f.story rename to summarizer/src/test/resources/news/0a4092bef1801863296777ebcfeceb1aec23c78f.story diff --git a/summarizer/src/test/resources/meta/0a4324d4a5effa420aa95bb058314eab35c73852.story b/summarizer/src/test/resources/news/0a4324d4a5effa420aa95bb058314eab35c73852.story similarity index 100% rename from summarizer/src/test/resources/meta/0a4324d4a5effa420aa95bb058314eab35c73852.story rename to summarizer/src/test/resources/news/0a4324d4a5effa420aa95bb058314eab35c73852.story diff --git a/summarizer/src/test/resources/meta/0a5458d3427b290524a8df11d8503a5b57b32747.story b/summarizer/src/test/resources/news/0a5458d3427b290524a8df11d8503a5b57b32747.story similarity index 100% rename from summarizer/src/test/resources/meta/0a5458d3427b290524a8df11d8503a5b57b32747.story rename to summarizer/src/test/resources/news/0a5458d3427b290524a8df11d8503a5b57b32747.story diff --git a/summarizer/src/test/resources/meta/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story b/summarizer/src/test/resources/news/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story similarity index 100% rename from summarizer/src/test/resources/meta/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story rename to summarizer/src/test/resources/news/0a5691b8fe654b6b2cdace5ab87aff2ee4c23577.story diff --git a/summarizer/src/test/resources/meta/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story b/summarizer/src/test/resources/news/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story similarity index 100% rename from 
summarizer/src/test/resources/meta/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story rename to summarizer/src/test/resources/news/0a6790f886a42a76945d4a21ed27c4ebd9ca1025.story
