This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch remove_copy_of_porterstemmer_from_summarizer_component in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit eb14b3379a3af101b965a9b6dccaef1093ecea26 Author: Martin Wiesner <[email protected]> AuthorDate: Sun Apr 16 17:05:14 2023 +0200 removes copy of PorterStemmer from summarizer component now relying on OpenNLP tools' default `PorterStemmer` (DRY) improves `DefaultDocProcessor` to better re-use the stemmer instance and make use of pre-compiled Pattern improves formatting along the path --- .../main/java/opennlp/summarization/Sentence.java | 11 +- .../opennlp/summarization/meta/MetaSummarizer.java | 14 +- .../preprocess/DefaultDocProcessor.java | 23 +- .../summarization/preprocess/PorterStemmer.java | 388 --------------------- 4 files changed, 24 insertions(+), 412 deletions(-) diff --git a/summarizer/src/main/java/opennlp/summarization/Sentence.java b/summarizer/src/main/java/opennlp/summarization/Sentence.java index fad8cf1..07079b2 100755 --- a/summarizer/src/main/java/opennlp/summarization/Sentence.java +++ b/summarizer/src/main/java/opennlp/summarization/Sentence.java @@ -23,8 +23,8 @@ import java.util.ArrayList; import java.util.Locale; import java.util.Objects; -import opennlp.summarization.preprocess.PorterStemmer; import opennlp.summarization.preprocess.StopWords; +import opennlp.tools.stemmer.PorterStemmer; /** * A representation of a sentence geared toward pagerank and summarization. @@ -159,16 +159,15 @@ public class Sentence { StringBuilder b = new StringBuilder(); wrdItr.setText(stringVal); for(int wrdEnd = wrdItr.next(); wrdEnd != BreakIterator.DONE; - wrdStrt = wrdEnd, wrdEnd = wrdItr.next()) - { + wrdStrt = wrdEnd, wrdEnd = wrdItr.next()) { String word = this.getStringVal().substring(wrdStrt, wrdEnd);//words[i].trim(); word = word.replace("\"|'",""); - //Skip stop words and stem the word. + // Skip stop words and stem the word. if(sw.isStopWord(word)) continue; - + stemmer.stem(word); - b.append(stemmer); + b.append(stemmer.toString()); b.append(SPACE); } return b.toString(); diff --git a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java index e6eca05..d4743ce 100644 --- a/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java +++ b/summarizer/src/main/java/opennlp/summarization/meta/MetaSummarizer.java @@ -33,7 +33,8 @@ import opennlp.summarization.preprocess.DefaultDocProcessor; import opennlp.summarization.textrank.TextRankSummarizer; import opennlp.summarization.DocProcessor; -/* + +/** * A summarizer that combines results from the text rank algorithm and the lexical chaining algorithm. * It runs both algorithm and uses the lexical chains to identify the main topics and relative importance * and the text rank to pick sentences from lexical chains. @@ -60,9 +61,8 @@ public class MetaSummarizer { // Rank sentences by merging the scores from lexical chaining and text rank. // maxWords -1 indicates rank all sentences. public int getBestSent(LexicalChain l, Hashtable<Integer, Score> pageRankScores) { - double bestScore = 0; int bestStr=-1; - for(Sentence s : l.getSentences()) - { + double bestScore = 0; int bestStr = -1; + for(Sentence s : l.getSentences()) { Score sc = pageRankScores.get(s.getSentId()); if(sc!=null && sc.getScore() > bestScore) { @@ -115,12 +115,10 @@ public class MetaSummarizer { } //Default Summarization using only lexical chains.. - public String summarize(String article, int maxWords) - { + public String summarize(String article, int maxWords) { //Build lexical Chains.. List<Sentence> sent = dp.getSentencesFromStr(article); - - List<Score>finalSc = rankSentences(article, sent, maxWords); + List<Score> finalSc = rankSentences(article, sent, maxWords); StringBuilder sb = new StringBuilder(); for (Score score : finalSc) { diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java index c54f76e..f4e1a0e 100755 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java +++ b/summarizer/src/main/java/opennlp/summarization/preprocess/DefaultDocProcessor.java @@ -29,11 +29,13 @@ import java.util.Locale; import java.util.Hashtable; import java.util.logging.Level; import java.util.logging.Logger; +import java.util.regex.Pattern; import opennlp.summarization.Sentence; import opennlp.summarization.DocProcessor; import opennlp.tools.sentdetect.SentenceDetectorME; import opennlp.tools.sentdetect.SentenceModel; +import opennlp.tools.stemmer.PorterStemmer; import opennlp.tools.stemmer.Stemmer; /** @@ -41,15 +43,19 @@ import opennlp.tools.stemmer.Stemmer; */ public class DefaultDocProcessor implements DocProcessor { private SentenceModel sentModel; - private Stemmer stemmer; - private StopWords sw; + private final Stemmer stemmer; + + private final static Pattern REPLACEMENT_PATTERN = + Pattern.compile("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;"); // Sentence fragmentation to use.. private static final int OPEN_NLP = 1; private static final int SIMPLE = 2; - private static final int SENTENCE_FRAG= OPEN_NLP; + private static final int SENTENCE_FRAG = OPEN_NLP; public DefaultDocProcessor(InputStream fragModelFile) { + stemmer = new PorterStemmer(); + try (InputStream modelIn = new BufferedInputStream(fragModelFile)){ sentModel = new SentenceModel(modelIn); } catch(Exception ex){ @@ -65,7 +71,6 @@ public class DefaultDocProcessor implements DocProcessor { Hashtable<String, List<Integer>> iidx, List<String> processedSent) { int oldSentEndIdx = 0; int sentEndIdx = 0; - Stemmer stemmer = new PorterStemmer(); StopWords sw = StopWords.getInstance(); BreakIterator iterator = BreakIterator.getSentenceInstance(Locale.US); BreakIterator wrdItr = BreakIterator.getWordInstance(Locale.US); @@ -73,8 +78,7 @@ public class DefaultDocProcessor implements DocProcessor { int start = iterator.first(); int sentCnt = 0; - for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) - { + for (int end = iterator.next(); end != BreakIterator.DONE; start = end, end = iterator.next()) { String sentence = str.substring(start,end);//str.substring(oldSentEndIdx, sentEndIdx).trim(); //Add the sentence as-is; do any processing at the word level @@ -100,7 +104,7 @@ public class DefaultDocProcessor implements DocProcessor { { if(stemedWrd.length()>1) { - List<Integer> sentList= iidx.get(stemedWrd); + List<Integer> sentList = iidx.get(stemedWrd); if(sentList==null) { sentList = new ArrayList<>(); @@ -130,7 +134,7 @@ public class DefaultDocProcessor implements DocProcessor { while ((nextLine = lnr.readLine()) != null) { String trimmedLine = nextLine.trim(); if (!trimmedLine.isEmpty() ) { - docBuffer.append(trimmedLine.replaceAll("&#?[0-9 a-z A-Z][0-9 a-z A-Z][0-9 a-z A-Z]?;", "")).append(" "); + docBuffer.append(REPLACEMENT_PATTERN.matcher(trimmedLine).replaceAll("")).append(" "); } } } catch (Exception ex) { @@ -154,8 +158,7 @@ public class DefaultDocProcessor implements DocProcessor { List<String> cleanedSents = new ArrayList<>(); this.getSentences(trimmedLine, sents, null, cleanedSents); int paraPos = 1; - for(String sen:sents) - { + for(String sen:sents) { Sentence s = new Sentence(); s.setSentId(sentNo++); s.setParagraph(paraNo); diff --git a/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java b/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java deleted file mode 100755 index 3b787f6..0000000 --- a/summarizer/src/main/java/opennlp/summarization/preprocess/PorterStemmer.java +++ /dev/null @@ -1,388 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. -*/ - -package opennlp.summarization.preprocess; - -/* - - Porter stemmer in Java. The original paper is in - - Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - no. 3, pp 130-137, - - See also http://www.tartarus.org/~martin/PorterStemmer - - History: - - Release 1 - - Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. - The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] - is then out outside the bounds of b. - - Release 2 - - Similarly, - - Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. - 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and - b[j] is then outside the bounds of b. - - Release 3 - - Considerably revised 4/9/00 in the light of many helpful suggestions - from Brian Goetz of Quiotix Corporation ([email protected]). - - Release 4 - -*/ - -import opennlp.tools.stemmer.Stemmer; - -/** - * Stemmer, implementing the Porter Stemming Algorithm - * <p> - * The Stemmer class transforms a word into its root form. The input - * word can be provided a character at time (by calling add()), or at once - * by calling one of the various stem(something) methods. - */ -public class PorterStemmer implements Stemmer { - - private char[] b; - private int i, /* offset into b */ - i_end, /* offset to end of stemmed word */ - j, k; - private static final int INC = 50; - /* unit of size whereby b is increased */ - public PorterStemmer() { - b = new char[INC]; - i = 0; - i_end = 0; - } - - /** - * Add a character to the word being stemmed. When you are finished - * adding characters, you can call stem(void) to stem the word. - */ - - public void add(char ch) { - if (i == b.length) - { char[] new_b = new char[i+INC]; - for (int c = 0; c < i; c++) new_b[c] = b[c]; - b = new_b; - } - b[i++] = ch; - } - - - /** Adds wLen characters to the word being stemmed contained in a portion - * of a char[] array. This is like repeated calls of add(char ch), but - * faster. - */ - - public void add(char[] w, int wLen) { - if (i+wLen >= b.length) - { char[] new_b = new char[i+wLen+INC]; - for (int c = 0; c < i; c++) new_b[c] = b[c]; - b = new_b; - } - for (int c = 0; c < wLen; c++) b[i++] = w[c]; - } - - /** - * After a word has been stemmed, it can be retrieved by toString(), - * or a reference to the internal buffer can be retrieved by getResultBuffer - * and getResultLength (which is generally more efficient.) - */ - @Override - public String toString() { return new String(b,0,i_end); } - - /** - * Returns the length of the word resulting from the stemming process. - */ - public int getResultLength() { return i_end; } - - /** - * Returns a reference to a character buffer containing the results of - * the stemming process. You also need to consult getResultLength() - * to determine the length of the result. - */ - public char[] getResultBuffer() { return b; } - - /* cons(i) is true <=> b[i] is a consonant. */ - - private boolean cons(int i) { - switch (b[i]) - { case 'a': case 'e': case 'i': case 'o': case 'u': return false; - case 'y': return (i==0) ? true : !cons(i-1); - default: return true; - } - } - - /* m() measures the number of consonant sequences between 0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - <c><v> gives 0 - <c>vc<v> gives 1 - <c>vcvc<v> gives 2 - <c>vcvcvc<v> gives 3 - .... - */ - - private int m() { - int n = 0; - int i = 0; - while(true) - { if (i > j) return n; - if (! cons(i)) break; i++; - } - i++; - while(true) - { while(true) - { if (i > j) return n; - if (cons(i)) break; - i++; - } - i++; - n++; - while(true) - { if (i > j) return n; - if (! cons(i)) break; - i++; - } - i++; - } - } - - /* vowelinstem() is true <=> 0,...j contains a vowel */ - - private boolean vowelinstem() { - int i; for (i = 0; i <= j; i++) if (! cons(i)) return true; - return false; - } - - /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ - - private boolean doublec(int j) { - if (j < 1) return false; - if (b[j] != b[j-1]) return false; - return cons(j); - } - - /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - - */ - - private boolean cvc(int i) { - if (i < 2 || !cons(i) || cons(i-1) || !cons(i-2)) return false; - { int ch = b[i]; - if (ch == 'w' || ch == 'x' || ch == 'y') return false; - } - return true; - } - - private boolean ends(String s) { - int l = s.length(); - int o = k-l+1; - if (o < 0) return false; - for (int i = 0; i < l; i++) if (b[o+i] != s.charAt(i)) return false; - j = k-l; - return true; - } - - /* setto(s) sets (j+1),...k to the characters in the string s, readjusting - k. */ - - private void setto(String s) { - int l = s.length(); - int o = j+1; - for (int i = 0; i < l; i++) b[o+i] = s.charAt(i); - k = j+l; - } - - /* r(s) is used further down. */ - - private void r(String s) { if (m() > 0) setto(s); } - - /* step1() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - - */ - - private void step1() { - if (b[k] == 's') - { - if (ends("sses")) k -= 2; else - if (ends("ies")) setto("i"); else - if (b[k-1] != 's') k--; - } - if (ends("eed")) { if (m() > 0) k--; } else - if ((ends("ed") || ends("ing")) && vowelinstem()) - { k = j; - if (ends("at")) setto("ate"); else - if (ends("bl")) setto("ble"); else - if (ends("iz")) setto("ize"); else - if (doublec(k)) - { k--; - { int ch = b[k]; - if (ch == 'l' || ch == 's' || ch == 'z') k++; - } - } - else if (m() == 1 && cvc(k)) setto("e"); - } - } - - /* step2() turns terminal y to i when there is another vowel in the stem. */ - - private void step2() { if (ends("y") && vowelinstem()) b[k] = 'i'; } - - /* step3() maps double suffices to single ones. so -ization ( = -ize plus - -ation) maps to -ize etc. note that the string before the suffix must give - m() > 0. */ - - private void step3() { if (k == 0) return; /* For Bug 1 */ switch (b[k-1]) { - case 'a': if (ends("ational")) { r("ate"); break; } - if (ends("tional")) { r("tion"); break; } - break; - case 'c': if (ends("enci")) { r("ence"); break; } - if (ends("anci")) { r("ance"); break; } - break; - case 'e': if (ends("izer")) { r("ize"); break; } - break; - case 'l': if (ends("bli")) { r("ble"); break; } - if (ends("alli")) { r("al"); break; } - if (ends("entli")) { r("ent"); break; } - if (ends("eli")) { r("e"); break; } - if (ends("ousli")) { r("ous"); break; } - break; - case 'o': if (ends("ization")) { r("ize"); break; } - if (ends("ation")) { r("ate"); break; } - if (ends("ator")) { r("ate"); break; } - break; - case 's': if (ends("alism")) { r("al"); break; } - if (ends("iveness")) { r("ive"); break; } - if (ends("fulness")) { r("ful"); break; } - if (ends("ousness")) { r("ous"); break; } - break; - case 't': if (ends("aliti")) { r("al"); break; } - if (ends("iviti")) { r("ive"); break; } - if (ends("biliti")) { r("ble"); break; } - break; - case 'g': if (ends("logi")) { r("log"); break; } - } } - - /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ - - private void step4() { switch (b[k]) { - case 'e': if (ends("icate")) { r("ic"); break; } - if (ends("ative")) { r(""); break; } - if (ends("alize")) { r("al"); break; } - break; - case 'i': if (ends("iciti")) { r("ic"); break; } - break; - case 'l': if (ends("ical")) { r("ic"); break; } - if (ends("ful")) { r(""); break; } - break; - case 's': if (ends("ness")) { r(""); break; } - break; - } } - - /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ - - private void step5() { - if (k == 0) return; /* for Bug 1 */ switch (b[k-1]) - { case 'a': if (ends("al")) break; return; - case 'c': if (ends("ance")) break; - if (ends("ence")) break; return; - case 'e': if (ends("er")) break; return; - case 'i': if (ends("ic")) break; return; - case 'l': if (ends("able")) break; - if (ends("ible")) break; return; - case 'n': if (ends("ant")) break; - if (ends("ement")) break; - if (ends("ment")) break; - /* element etc. not stripped before the m */ - if (ends("ent")) break; return; - case 'o': if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; - /* j >= 0 fixes Bug 2 */ - if (ends("ou")) break; return; - /* takes care of -ous */ - case 's': if (ends("ism")) break; return; - case 't': if (ends("ate")) break; - if (ends("iti")) break; return; - case 'u': if (ends("ous")) break; return; - case 'v': if (ends("ive")) break; return; - case 'z': if (ends("ize")) break; return; - default: return; - } - if (m() > 1) k = j; - } - - /* step6() removes a final -e if m() > 1. */ - - private void step6() { - j = k; - if (b[k] == 'e') - { int a = m(); - if (a > 1 || a == 1 && !cvc(k-1)) k--; - } - if (b[k] == 'l' && doublec(k) && m() > 1) k--; - } - - /** Stem the word placed into the Stemmer buffer through calls to add(). - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public void stem() { - k = i - 1; - if (k > 1) { step1(); step2(); step3(); step4(); step5(); step6(); } - i_end = k+1; i = 0; - } - - public CharSequence stem(CharSequence word) { - b = new char[word.length()]; - char[] arr = word.toString().toCharArray(); - for(k=0;k<arr.length;k++) this.add(arr[k]); - stem(); - return this.toString(); - } -} -
