Re: [PR] OPENNLP-1594 Add stricter tests for Summarizer component (opennlp-sandbox)

via GitHub Sat, 13 Jul 2024 00:59:07 -0700


kinow commented on code in PR #158:
URL: https://github.com/apache/opennlp-sandbox/pull/158#discussion_r1676778913



##########
summarizer/src/main/java/opennlp/summarization/Sentence.java:
##########
@@ -113,38 +123,21 @@ public List<Sentence> getLinks() {
     return this.links;
   }
 
-  public double getWordWt() {
+  public double getWordWeight() {
     return wordWt;
   }
 
-  public void setWordWt(double wordWt) {
+  public void setWordWeight(double wordWt) {
     this.wordWt = wordWt;
   }
 
   public int getWordCnt() {

Review Comment:
   Since the `Wt` was de-abbreviated, maybe `cnt` could also become `count`? Or 
maybe something for a future change...



##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java:
##########
@@ -22,20 +22,45 @@
 import java.util.List;
 
 /**
- * Uses the lexical chaining algorithm to extract keywords.
+ * Uses the {@link LexicalChain lexical chaining} algorithm to extract 
keywords.
+ *
+ * @see LexicalChain
  */
 public class LexChainingKeywordExtractor {
 
-  // Simple logic to pull out the keyword based on longest lexical chains..
-  public List<String> getKeywords(List<LexicalChain> lexicalChains, int 
noOfKeywords) {
-    Collections.sort(lexicalChains);
-    List<String> ret = new ArrayList<>();
-    for (int i = 0; i < Math.min(lexicalChains.size(), noOfKeywords); i++) {
-      List<Word> words = lexicalChains.get(i).getWord();
-      if (!words.isEmpty() && !ret.contains(words.get(0).getLexicon())) {
-        ret.add(words.get(0).getLexicon());
+  /**
+   * Extracts keywords from a list of {@link LexicalChain lexical chains}, 
limited by {@code noOfKeywords}.
+   *
+   * @param lexicalChains The {@link LexicalChain lexical chains} to process. 
Must not be {@code null}.
+   * @param noOfKeywords The upper limit of keywords. Must be greater than 
{@code zero}.
+   *
+   * @return The extracted keywords as a list. Guaranteed to be not {@code 
null}.
+   *
+   * @throws IllegalArgumentException Thrown if parameters are invalid.
+   * @implNote This operation is based on longest lexical chains.

Review Comment:
   Interesting, I never used `@implNote`, but searched about it, and learned 
something new today! Thanks!!



##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java:
##########
@@ -44,95 +48,130 @@ public class LexicalChainingSummarizer implements 
Summarizer {
   private final DocProcessor docProcessor;
   private final WordRelationshipDetermination wordRel;
 
-  public LexicalChainingSummarizer(DocProcessor dp, OpenNLPPOSTagger 
posTagger) {
-    docProcessor = dp;
-    tagger = posTagger;
-    wordRel = new WordRelationshipDetermination();
+  /**
+   * Instantiates a {@link LexicalChainingSummarizer}.
+   *
+   * @param docProcessor The {@link DocProcessor} to use at runtime. Must not 
be {@code null}.
+   * @param languageCode An ISO-language code for obtaining a {@link POSModel}.
+   *                     Must not be {@code null}.
+   *
+   * @throws IllegalArgumentException Thrown if parameters are invalid.
+   */
+  public LexicalChainingSummarizer(DocProcessor docProcessor, String 
languageCode) throws IOException {
+    this(docProcessor, new NounPOSTagger(languageCode));
   }
 
-  public LexicalChainingSummarizer(DocProcessor dp, InputStream posModelFile) 
throws Exception {
-    this(dp, new OpenNLPPOSTagger(dp, posModelFile));
+  /**
+   * Instantiates a {@link LexicalChainingSummarizer}.
+   *
+   * @param docProcessor The {@link DocProcessor} to use at runtime. Must not 
be {@code null}.
+   * @param posTagger The {@link NounPOSTagger} to use at runtime. Must not be 
{@code null}.
+   *
+   * @throws IllegalArgumentException Thrown if parameters are invalid.
+   */
+  public LexicalChainingSummarizer(DocProcessor docProcessor, NounPOSTagger 
posTagger) {
+    if (docProcessor == null) throw new IllegalArgumentException("Parameter 
'docProcessor' must not be null!");
+    if (posTagger == null) throw new IllegalArgumentException("Parameter 
'posTagger' must not be null!");
+
+    this.docProcessor = docProcessor;
+    tagger = posTagger;
+    wordRel = new WordRelationshipDetermination();
   }
 
-  //Build Lexical chains..
-  public List<LexicalChain> buildLexicalChains(String article, List<Sentence> 
sent) {
-    // POS tag article
-    Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
-    List<LexicalChain> lc = new ArrayList<>();
-    // Build lexical chains
-    // For each sentence
-    for (Sentence currSent : sent) {
-      String taggedSent = tagger.getTaggedString(currSent.getStringVal());
-      List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
-      //       For each noun
-      for (String noun : nouns) {
-        int chainsAddCnt = 0;
-        //  Loop through each LC
-        for (LexicalChain l : lc) {
-          try {
-            WordRelation rel = wordRel.getRelation(l, noun, 
(currSent.getSentId() - l.start) > 7);
-            //  Is the noun an exact match to one of the current LCs (Strong 
relation)
-            //  Add sentence to chain
-            if (rel.relation() == WordRelation.STRONG_RELATION) {
-              addToChain(rel.dest(), l, chains, currSent);
-              if (currSent.getSentId() - l.last > 10) {
-                l.occurrences++;
-                l.start = currSent.getSentId();
-              }
-              chainsAddCnt++;
-            } else if (rel.relation() == WordRelation.MED_RELATION) {
-              //  Add sentence to chain if it is 7 sentences away from start 
of chain
-              addToChain(rel.dest(), l, chains, currSent);
-              chainsAddCnt++;
-              //If greater than 7 we will add it but call it a new occurrence 
of the lexical chain...
-              if (currSent.getSentId() - l.start > 7) {
-                l.occurrences++;
-                l.start = currSent.getSentId();
-              }
-            } else if (rel.relation() == WordRelation.WEAK_RELATION) {
-              if (currSent.getSentId() - l.start <= 3) {
+  /**
+   * Constructs a list of {@link LexicalChain lexical chains} from specified 
sentences.
+   *
+   * @param article TODO unused parameter -> remove it?!

Review Comment:
   If not used and there are no other classes (siblings/parents, or just very 
similar with exact list of params -- like in Python) then I think there's no 
need to keep it. Not sure when would be the best time to do that, probably when 
bumping the major version?



##########
summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java:
##########
@@ -15,32 +15,33 @@
  * limitations under the License.
  */
 
-package opennlp.summarization.preprocess;
+package opennlp.summarization.lexicalchaining;
 
-import java.util.List;
+import opennlp.summarization.AbstractSummarizerTest;
+import opennlp.summarization.Summarizer;
 
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import opennlp.summarization.Sentence;
+import org.junit.jupiter.api.BeforeEach;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
 
-class DocProcessorTest {
+/**
+ * Tests the implementation of {@link LexicalChainingSummarizer}.
+ */
+public class LexicalChainingSummarizerNewsTest extends AbstractSummarizerTest {
 
-  private static DefaultDocProcessor dp;
+  // SUT
+  private LexicalChainingSummarizer lexicalChainSummarizer;
 
-  @BeforeAll
-  static void initEnv() {
-    dp = new 
DefaultDocProcessor(DocProcessorTest.class.getResourceAsStream("/en-sent.bin"));
+  @BeforeEach
+  void setUp() {
+    lexicalChainSummarizer = new LexicalChainingSummarizer(docProcessor, 
posTagger);
   }
 
-  @Test
-  void testGetSentencesFromStr() {
-    String sent = "This is a sentence, with some punctuations; to test if the 
sentence breaker can handle it! Is every thing working OK ? Yes.";
-    List<Sentence> doc = dp.getSentencesFromStr(sent);
-    //dp.docToString(fileName);
-    assertEquals(doc.size(), 3);
+  @Override
+  public Summarizer getSummarizer() {
+    return lexicalChainSummarizer;
   }
-
+  

Review Comment:
   Unneeded spaces?



##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java:
##########
@@ -28,7 +32,26 @@ public interface POSTagger {
   int ADVERB = 3;
   int PRONOUN = 4;
 
+  /**
+   * Tags a given {@code input} text so that word classes are appenend to each 
token.

Review Comment:
   s/appenend/appended? 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Re: [PR] OPENNLP-1594 Add stricter tests for Summarizer component (opennlp-sandbox)

Reply via email to