kinow commented on code in PR #158:
URL: https://github.com/apache/opennlp-sandbox/pull/158#discussion_r1676778913
##########
summarizer/src/main/java/opennlp/summarization/Sentence.java:
##########
@@ -113,38 +123,21 @@ public List<Sentence> getLinks() {
return this.links;
}
- public double getWordWt() {
+ public double getWordWeight() {
return wordWt;
}
- public void setWordWt(double wordWt) {
+ public void setWordWeight(double wordWt) {
this.wordWt = wordWt;
}
public int getWordCnt() {
Review Comment:
Since the `Wt` was de-abbreviated, maybe `cnt` could also become `count`? Or
maybe something for a future change...
##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexChainingKeywordExtractor.java:
##########
@@ -22,20 +22,45 @@
import java.util.List;
/**
- * Uses the lexical chaining algorithm to extract keywords.
+ * Uses the {@link LexicalChain lexical chaining} algorithm to extract
keywords.
+ *
+ * @see LexicalChain
*/
public class LexChainingKeywordExtractor {
- // Simple logic to pull out the keyword based on longest lexical chains..
- public List<String> getKeywords(List<LexicalChain> lexicalChains, int
noOfKeywords) {
- Collections.sort(lexicalChains);
- List<String> ret = new ArrayList<>();
- for (int i = 0; i < Math.min(lexicalChains.size(), noOfKeywords); i++) {
- List<Word> words = lexicalChains.get(i).getWord();
- if (!words.isEmpty() && !ret.contains(words.get(0).getLexicon())) {
- ret.add(words.get(0).getLexicon());
+ /**
+ * Extracts keywords from a list of {@link LexicalChain lexical chains},
limited by {@code noOfKeywords}.
+ *
+ * @param lexicalChains The {@link LexicalChain lexical chains} to process.
Must not be {@code null}.
+ * @param noOfKeywords The upper limit of keywords. Must be greater than
{@code zero}.
+ *
+ * @return The extracted keywords as a list. Guaranteed to be not {@code
null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ * @implNote This operation is based on longest lexical chains.
Review Comment:
Interesting, I never used `@implNote`, but searched about it, and learned
something new today! Thanks!!
##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizer.java:
##########
@@ -44,95 +48,130 @@ public class LexicalChainingSummarizer implements
Summarizer {
private final DocProcessor docProcessor;
private final WordRelationshipDetermination wordRel;
- public LexicalChainingSummarizer(DocProcessor dp, OpenNLPPOSTagger
posTagger) {
- docProcessor = dp;
- tagger = posTagger;
- wordRel = new WordRelationshipDetermination();
+ /**
+ * Instantiates a {@link LexicalChainingSummarizer}.
+ *
+ * @param docProcessor The {@link DocProcessor} to use at runtime. Must not
be {@code null}.
+ * @param languageCode An ISO-language code for obtaining a {@link POSModel}.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public LexicalChainingSummarizer(DocProcessor docProcessor, String
languageCode) throws IOException {
+ this(docProcessor, new NounPOSTagger(languageCode));
}
- public LexicalChainingSummarizer(DocProcessor dp, InputStream posModelFile)
throws Exception {
- this(dp, new OpenNLPPOSTagger(dp, posModelFile));
+ /**
+ * Instantiates a {@link LexicalChainingSummarizer}.
+ *
+ * @param docProcessor The {@link DocProcessor} to use at runtime. Must not
be {@code null}.
+ * @param posTagger The {@link NounPOSTagger} to use at runtime. Must not be
{@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
+ public LexicalChainingSummarizer(DocProcessor docProcessor, NounPOSTagger
posTagger) {
+ if (docProcessor == null) throw new IllegalArgumentException("Parameter
'docProcessor' must not be null!");
+ if (posTagger == null) throw new IllegalArgumentException("Parameter
'posTagger' must not be null!");
+
+ this.docProcessor = docProcessor;
+ tagger = posTagger;
+ wordRel = new WordRelationshipDetermination();
}
- //Build Lexical chains..
- public List<LexicalChain> buildLexicalChains(String article, List<Sentence>
sent) {
- // POS tag article
- Hashtable<String, List<LexicalChain>> chains = new Hashtable<>();
- List<LexicalChain> lc = new ArrayList<>();
- // Build lexical chains
- // For each sentence
- for (Sentence currSent : sent) {
- String taggedSent = tagger.getTaggedString(currSent.getStringVal());
- List<String> nouns = tagger.getWordsOfType(taggedSent, POSTagger.NOUN);
- // For each noun
- for (String noun : nouns) {
- int chainsAddCnt = 0;
- // Loop through each LC
- for (LexicalChain l : lc) {
- try {
- WordRelation rel = wordRel.getRelation(l, noun,
(currSent.getSentId() - l.start) > 7);
- // Is the noun an exact match to one of the current LCs (Strong
relation)
- // Add sentence to chain
- if (rel.relation() == WordRelation.STRONG_RELATION) {
- addToChain(rel.dest(), l, chains, currSent);
- if (currSent.getSentId() - l.last > 10) {
- l.occurrences++;
- l.start = currSent.getSentId();
- }
- chainsAddCnt++;
- } else if (rel.relation() == WordRelation.MED_RELATION) {
- // Add sentence to chain if it is 7 sentences away from start
of chain
- addToChain(rel.dest(), l, chains, currSent);
- chainsAddCnt++;
- //If greater than 7 we will add it but call it a new occurrence
of the lexical chain...
- if (currSent.getSentId() - l.start > 7) {
- l.occurrences++;
- l.start = currSent.getSentId();
- }
- } else if (rel.relation() == WordRelation.WEAK_RELATION) {
- if (currSent.getSentId() - l.start <= 3) {
+ /**
+ * Constructs a list of {@link LexicalChain lexical chains} from specified
sentences.
+ *
+ * @param article TODO unused parameter -> remove it?!
Review Comment:
If not used and there are no other classes (siblings/parents, or just very
similar with exact list of params -- like in Python) then I think there's no
need to keep it. Not sure when would be the best time to do that, probably when
bumping the major version?
##########
summarizer/src/test/java/opennlp/summarization/lexicalchaining/LexicalChainingSummarizerNewsTest.java:
##########
@@ -15,32 +15,33 @@
* limitations under the License.
*/
-package opennlp.summarization.preprocess;
+package opennlp.summarization.lexicalchaining;
-import java.util.List;
+import opennlp.summarization.AbstractSummarizerTest;
+import opennlp.summarization.Summarizer;
-import org.junit.jupiter.api.BeforeAll;
-import org.junit.jupiter.api.Test;
-
-import opennlp.summarization.Sentence;
+import org.junit.jupiter.api.BeforeEach;
import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+import static org.junit.jupiter.api.Assertions.assertNotNull;
-class DocProcessorTest {
+/**
+ * Tests the implementation of {@link LexicalChainingSummarizer}.
+ */
+public class LexicalChainingSummarizerNewsTest extends AbstractSummarizerTest {
- private static DefaultDocProcessor dp;
+ // SUT
+ private LexicalChainingSummarizer lexicalChainSummarizer;
- @BeforeAll
- static void initEnv() {
- dp = new
DefaultDocProcessor(DocProcessorTest.class.getResourceAsStream("/en-sent.bin"));
+ @BeforeEach
+ void setUp() {
+ lexicalChainSummarizer = new LexicalChainingSummarizer(docProcessor,
posTagger);
}
- @Test
- void testGetSentencesFromStr() {
- String sent = "This is a sentence, with some punctuations; to test if the
sentence breaker can handle it! Is every thing working OK ? Yes.";
- List<Sentence> doc = dp.getSentencesFromStr(sent);
- //dp.docToString(fileName);
- assertEquals(doc.size(), 3);
+ @Override
+ public Summarizer getSummarizer() {
+ return lexicalChainSummarizer;
}
-
+
Review Comment:
Unneeded spaces?
##########
summarizer/src/main/java/opennlp/summarization/lexicalchaining/POSTagger.java:
##########
@@ -28,7 +32,26 @@ public interface POSTagger {
int ADVERB = 3;
int PRONOUN = 4;
+ /**
+ * Tags a given {@code input} text so that word classes are appenend to each
token.
Review Comment:
s/appenend/appended?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]