This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch update-solr-dep-to-9.10.x
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 387c6287ce7f4a2c1403e46734061a849615b49f
Author: Martin Wiesner <[email protected]>
AuthorDate: Wed Nov 26 09:04:58 2025 +0100

    Modernize opennlp-similarity component
    
    - updates opennlp-similarity's dep 'solr-core' to a more recent version, 9.10.0
    - updates transitive dep Lucene to 9.12.3
    - adds required dep lucene-queryparser as some classes were extracted from the Lucene core jar
    - declares compile-time dependency 'commons-collections4'
    - removes non-required runtime dependency 'commons-io'
    - conducts code simplifications
    - conducts some code cleanup along the path
---
 opennlp-similarity/pom.xml                         | 19 +++++--
 .../tools/doc_classifier/DocClassifier.java        | 40 +++++----------
 .../java/opennlp/tools/fca/BasicLevelMetrics.java  | 19 +++----
 .../java/opennlp/tools/fca/ConceptLattice.java     |  5 +-
 .../JSMLearnerOnLatticeWithDeduction.java          | 43 ++++++----------
 .../apps/solr/SyntGenRequestHandler.java           | 41 +++++++--------
 .../tools/textsimilarity/ParseTreeChunk.java       | 60 ++++++++++++----------
 7 files changed, 103 insertions(+), 124 deletions(-)

diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml
index 3fcbcbe..06a5360 100644
--- a/opennlp-similarity/pom.xml
+++ b/opennlp-similarity/pom.xml
@@ -31,8 +31,10 @@
     <jakarta.mail.version>2.1.4</jakarta.mail.version>
     <org.json.version>20250517</org.json.version>
+    <lucene.version>9.12.3</lucene.version>
+    <solr.version>9.10.0</solr.version>
     <tika.version>3.2.3</tika.version>
-    <solr.version>8.11.4</solr.version>
+
     <docx4j.version>11.5.7</docx4j.version>
     <dl4j.version>1.0.0-M2.1</dl4j.version>
     <hdf5.version>1.14.3-1.5.10</hdf5.version>
@@ -101,12 +103,12 @@
       <scope>runtime</scope>
     </dependency>
     <!-- End model resources -->
-    
+
     <dependency>
-      <groupId>commons-io</groupId>
-      <artifactId>commons-io</artifactId>
-      <scope>runtime</scope>
+      <groupId>org.apache.commons</groupId>
+      <artifactId>commons-collections4</artifactId>
     </dependency>
+
     <dependency>
       <groupId>jakarta.xml.bind</groupId>
       <artifactId>jakarta.xml.bind-api</artifactId>
@@ -158,6 +160,13 @@
         </exclusions>
     </dependency>
 
+    <!-- required for solr in the chosen version -->
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-queryparser</artifactId>
+      <version>${lucene.version}</version>
+    </dependency>
+
     <dependency>
       <groupId>edu.mit</groupId>
       <artifactId>jverbnet</artifactId>
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
index 784ebb2..5f50277 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java
@@ -49,7 +49,9 @@ public class DocClassifier {
 
   private static final Logger LOGGER = LoggerFactory.getLogger(DocClassifier.class);
 
   public static final String DOC_CLASSIFIER_KEY = "doc_class";
+  public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map";
   public static final String RESOURCE_DIR = null;
+  private Map<String, Float> scoredClasses;
 
   public static final Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f;
@@ -66,22 +68,10 @@ public class DocClassifier {
   // to accumulate classif results
   private final CountItemsList<String> localCats = new CountItemsList<>();
   private static final int MAX_TOKENS_TO_FORM = 30;
-  private final String CAT_COMPUTING = "computing";
-  public static final String DOC_CLASSIFIER_MAP = "doc_classifier_map";
-  private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60; // if
-                                                                   // sentence
-                                                                   // is
-                                                                   // shorter,
-                                                                   // should
-                                                                   // not
-                                                                   // be
-                                                                   // used
-                                                                   // for
-                                                                   // classification
-  private static final int MIN_CHARS_IN_QUERY = 30; // if combination of
-                                                    // keywords are shorter,
-                                                    // should not be used
-                                                    // for classification
+  // if sentence is shorter, should not be used for classification
+  private static final int MIN_SENTENCE_LENGTH_TO_CATEGORIZE = 60;
+  // if combination of keywords are shorter, should not be used for classification
+  private static final int MIN_CHARS_IN_QUERY = 30;
 
   // these are categories from the index
   public static final String[] CATEGORIES = new String[]
@@ -94,13 +84,13 @@ public class DocClassifier {
       try {
         indexDirectory = FSDirectory.open(new File(INDEX_PATH).toPath());
       } catch (IOException e2) {
-        LOGGER.error("problem opening index " + e2);
+        LOGGER.error("problem opening index {}", String.valueOf(e2));
       }
       try {
         indexReader = DirectoryReader.open(indexDirectory);
         indexSearcher = new IndexSearcher(indexReader);
       } catch (IOException e2) {
-        LOGGER.error("problem reading index \n" + e2);
+        LOGGER.error("problem reading index \n{}", String.valueOf(e2));
       }
     }
   }
@@ -134,7 +124,7 @@
     } catch (IOException e1) {
       LOGGER.error("problem searching index \n", e1);
     }
-    LOGGER.debug("Found " + hits.totalHits + " hits for " + queryStr);
+    LOGGER.debug("Found {} hits for {}", hits.totalHits, queryStr);
 
     int count = 0;
@@ -143,8 +133,7 @@
       try {
         doc = indexSearcher.doc(scoreDoc.doc);
       } catch (IOException e) {
-        LOGGER.error("Problem searching training set for classif \n"
-            + e);
+        LOGGER.error("Problem searching training set for classif \n{}", String.valueOf(e));
         continue;
       }
       String flag = doc.get("class");
@@ -170,8 +159,7 @@
         if (scoredClasses.get(key) > MIN_TOTAL_SCORE_FOR_CATEGORY)
           resultsAboveThresh.add(key);
         else
-          LOGGER.debug("Too low score of " + scoredClasses.get(key)
-              + " for category = " + key);
+          LOGGER.debug("Too low score of {} for category = {}", scoredClasses.get(key), key);
       }
 
       int len = resultsAboveThresh.size();
@@ -182,7 +170,7 @@
       else
         results = resultsAboveThresh;
     } catch (Exception e) {
-      LOGGER.error("Problem aggregating search results\n" + e);
+      LOGGER.error("Problem aggregating search results\n{}", String.valueOf(e));
     }
     if (results.size() < 2)
       return results;
@@ -262,9 +250,9 @@
           continue;
         String query = formClassifQuery(sentence, MAX_TOKENS_TO_FORM);
         classifResults = classifySentence(query);
-        if (classifResults != null && classifResults.size() > 0) {
+        if (classifResults != null && !classifResults.isEmpty()) {
           localCats.addAll(classifResults);
-          LOGGER.debug(sentence + " => " + classifResults);
+          LOGGER.debug("{} => {}", sentence, classifResults);
         }
       }
     } catch (Exception e) {
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java b/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
index 668e0ab..cc619cc 100755
--- a/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/fca/BasicLevelMetrics.java
@@ -19,16 +19,16 @@ package opennlp.tools.fca;
 
 import java.util.ArrayList;
 import java.util.HashSet;
+import java.util.List;
 import java.util.Set;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 
 public class BasicLevelMetrics {
 
-  final ConceptLattice cl;
-  ArrayList<ArrayList<Integer>> attributesExtent;
-  ArrayList<ArrayList<Integer>> objectsIntent = null;
-  ArrayList<Integer> attributes = null;
+  private final ConceptLattice cl;
+  private List<ArrayList<Integer>> attributesExtent;
+  private List<Integer> attributes = null;
 
   private final double[][] objectsSimilarityJ;
   private final double [][] objectsSimilaritySMC;
@@ -41,7 +41,7 @@
 
   public void setUp(){
     attributesExtent = new ArrayList<>();
-    objectsIntent = new ArrayList<>();
+    List<ArrayList<Integer>> objectsIntent = new ArrayList<>();
     attributes = new ArrayList<>();
 
     for (int i=0;i<cl.attributeCount;i++){
@@ -66,15 +66,10 @@
       objectsSimilarityJ[i][i] = 1;
       objectsSimilaritySMC[i][i] = 1;
     }
-
-    //System.out.println("J");
-    //System.out.println(Arrays.deepToString(objectsSimilarityJ));
-    //System.out.println("SMC");
-    //System.out.println(Arrays.deepToString(objectsSimilaritySMC));
   }
 
-  //Utility functions for Similarity approach (S)
+  // Utility functions for Similarity approach (S)
   public double simSMC (ArrayList<Integer> intent1, ArrayList<Integer>intent2){
     int tp = (ListUtils.intersection(intent1,intent2)).size();
     ArrayList<Integer> fnlst = new ArrayList<>(this.attributes);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java b/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
index 6d59154..5735b31 100755
--- a/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/fca/ConceptLattice.java
@@ -26,7 +26,7 @@ import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Set;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 
 public class ConceptLattice {
   int objectCount;
@@ -88,7 +88,6 @@ public class ConceptLattice {
     }
   }
 
-
   public int GetMaximalConcept(List<Integer> intent, int Generator) {
     boolean parentIsMaximal = true;
     while(parentIsMaximal) {
@@ -105,7 +104,7 @@ public class ConceptLattice {
   }
 
   public void AddExtentToAncestors(LinkedHashSet<Integer>extent, int curNode) {
-    if (conceptList.get(curNode).parents.size()>0){
+    if (!conceptList.get(curNode).parents.isEmpty()){
       for (int parent : conceptList.get(curNode).parents){
         conceptList.get(parent).addExtents(extent);
         AddExtentToAncestors(extent, parent);
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
index d9c8b83..6b9f32d 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/JSMLearnerOnLatticeWithDeduction.java
@@ -22,7 +22,7 @@ import java.util.Arrays;
 import java.util.LinkedHashSet;
 import java.util.List;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 
 import opennlp.tools.parse_thicket.pattern_structure.LinguisticPatternStructure;
 import opennlp.tools.similarity.apps.utils.Pair;
@@ -31,13 +31,12 @@ import opennlp.tools.textsimilarity.ParseTreeChunk;
 public class JSMLearnerOnLatticeWithDeduction extends JSMLearnerOnLatticeBase{
 
   final List<JSMDecision> accumulatedJSMResults = new ArrayList<>();
-
-
-  public JSMDecision buildLearningModel(List<String> posTexts, List<String> negTexts,
-      String unknown, String[] separationKeywords){
-    psPos = new LinguisticPatternStructure(0,0); psNeg = new LinguisticPatternStructure(0,0);
+  public JSMDecision buildLearningModel(List<String> posTexts, List<String> negTexts,
+                                        String unknown, String[] separationKeywords){
+    psPos = new LinguisticPatternStructure(0,0);
+    psNeg = new LinguisticPatternStructure(0,0);
     if (separationKeywords!=null){ // re-sort by occurrence of separation keyword
-      Pair<List<String>, List<String>> pair = reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords );
+      Pair<List<String>, List<String>> pair = reGroupByOccurrenceOfSeparationKeyword(posTexts, negTexts, separationKeywords);
       posTexts = pair.getFirst();
       negTexts = pair.getSecond();
     }
@@ -96,8 +95,8 @@
     for(int iConcept = 0; iConcept<psNeg.conceptList.size(); iConcept++){
       for (List<List<ParseTreeChunk>> negIntersection : negIntersections) {
-        intersection = md
-            .matchTwoSentencesGroupedChunksDeterministic(psNeg.conceptList.get(iConcept).intent, negIntersection);
+        intersection = md.matchTwoSentencesGroupedChunksDeterministic(
+            psNeg.conceptList.get(iConcept).intent, negIntersection);
         if (reduceList(intersection).size() > 0)
           posIntersectionsUnderNeg.add(reduceList(intersection));
       }
@@ -112,8 +111,8 @@
       }
     }
 
-    List<ParseTreeChunk>posIntersectionsUnderNegLst = flattenParseTreeChunkLst(posIntersectionsUnderNeg);
-    List<ParseTreeChunk>negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
+    List<ParseTreeChunk> posIntersectionsUnderNegLst = flattenParseTreeChunkLst(posIntersectionsUnderNeg);
+    List<ParseTreeChunk> negIntersectionsUnderPosLst=flattenParseTreeChunkLst(negIntersectionsUnderPos);
 
     posIntersectionsUnderNegLst = subtract(posIntersectionsUnderNegLst, negIntersectionsUnderPosLst);
     negIntersectionsUnderPosLst= subtract(negIntersectionsUnderPosLst, posIntersectionsUnderNegLst);
@@ -135,13 +134,10 @@
   }
 
   private List<List<ParseTreeChunk>> computeIntersectionWithIntentExtendedByDeduction(
-      LinguisticPatternStructure psPos, int iConcept,
-      List<List<ParseTreeChunk>> chunksUnknown) {
+      LinguisticPatternStructure psPos, int iConcept, List<List<ParseTreeChunk>> chunksUnknown) {
 
-    List<List<ParseTreeChunk>> intent = psPos.conceptList.get(iConcept).intent,
-        intentExtendedByDeduction = new ArrayList<>();
+    List<List<ParseTreeChunk>> intent = psPos.conceptList.get(iConcept).intent, intentExtendedByDeduction = new ArrayList<>();
-
     for( List<ParseTreeChunk> group: intent){
       List<ParseTreeChunk> newGroup = new ArrayList<>();
       for(ParseTreeChunk ch: group){
@@ -153,9 +149,7 @@
       }
       intentExtendedByDeduction .add(newGroup);
     }
-    return md
-        .matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction, chunksUnknown);
-
+    return md.matchTwoSentencesGroupedChunksDeterministic(intentExtendedByDeduction, chunksUnknown);
   }
 
   // for list of words in a phrase, identify if it includes a separation word/multiword and get respective clause body
@@ -176,7 +170,7 @@
   public Pair<List<String>, List<String>> reGroupByOccurrenceOfSeparationKeyword(List<String> posTexts, List<String> negTexts, String[] keywords){
     List<String> posTextsNew = new ArrayList<>(), negTextsNew = new ArrayList<>();
 
-    for(String posText:posTexts){
+    for(String posText:posTexts) {
       boolean multiwordOccurs = true;
       for(String keyword: keywords){
         if (!posText.contains(keyword)) {
@@ -190,7 +184,7 @@
       else
         negTextsNew.add(posText);
     }
-    for(String negText:negTexts){
+    for(String negText:negTexts) {
       boolean multiwordOccurs = true;
       for(String keyword: keywords){
         if (!negText.contains(keyword)) {
@@ -204,8 +198,6 @@
       else
         negTextsNew.add(negText);
     }
-
-
     return new Pair<>(posTextsNew, negTextsNew);
   }
@@ -234,10 +226,5 @@
     // Finally, do prediction
     JSMDecision dec = // may be determined by ...
         jsm.buildLearningModel(Arrays.asList(posArr), Arrays.asList(negArr), unknown , new String[]{"property"});
-
-
-
-
-
   }
 }
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
index 4a25654..bf78e15 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
@@ -60,7 +60,10 @@ import org.apache.solr.search.SolrIndexSearcher;
 
 public class SyntGenRequestHandler extends SearchHandler {
 
-  private final ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
+  private static final String SCORE = "score";
+  private static final String RESPONSE = "response";
+  private static final String PREFIX_QUERY = "q=";
+  private final ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer();
 
   public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){
     try {
@@ -77,7 +80,7 @@
 
     //modify rsp
     NamedList<Object> values = rsp.getValues();
-    ResultContext c = (ResultContext) values.get("response");
+    ResultContext c = (ResultContext) values.get(RESPONSE);
 
     if (c==null)
       return;
@@ -97,13 +100,12 @@
       e.printStackTrace();
     }
     // c.docs = dListResult;
-    values.remove("response");
+    values.remove(RESPONSE);
     rsp.setAllValues(values);
   }
 
-  public DocList filterResultsBySyntMatchReduceDocSet(DocList docList,
-      SolrQueryRequest req, SolrParams params) {
+  public DocList filterResultsBySyntMatchReduceDocSet(DocList docList, SolrQueryRequest req, SolrParams params) {
     //if (!docList.hasScores())
     //  return docList;
@@ -117,7 +119,7 @@
     String requestExpression = req.getParamString();
     String[] exprParts = requestExpression.split("&");
     for(String part: exprParts){
-      if (part.startsWith("q="))
+      if (part.startsWith(PREFIX_QUERY))
         requestExpression = part;
     }
     String fieldNameQuery = StringUtils.substringBetween(requestExpression, "=", ":");
@@ -126,7 +128,7 @@
     if (queryParts.length>=2 && queryParts[1].length()>5)
       requestExpression = queryParts[1].replace('+', ' ');
     else if (requestExpression.contains(":")) {// still field-based expression
-      requestExpression = requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' ').replaceAll(" ", " ").replace("q=", "");
+      requestExpression = requestExpression.replaceAll(fieldNameQuery+":", "").replace('+',' ').replaceAll(" ", " ").replace(PREFIX_QUERY, "");
     }
     if (fieldNameQuery ==null)
@@ -217,7 +219,7 @@
     int numFound = 0;
     List<SolrDocument> slice = new ArrayList<>();
     for (SolrDocument sdoc : results) {
-      Float score = (Float) sdoc.getFieldValue("score");
+      Float score = (Float) sdoc.getFieldValue(SCORE);
       if (maxScore < score) {
         maxScore = score;
       }
@@ -231,13 +233,13 @@
     results.setNumFound(numFound);
     results.setMaxScore(maxScore);
     results.setStart(start);
-    rsp.add("response", results);
+    rsp.add(RESPONSE, results);
   }
 
   private Query buildFilter(String[] fqs, SolrQueryRequest req)
-      throws IOException, ParseException {
+          throws IOException, ParseException {
     if (fqs != null && fqs.length > 0) {
       BooleanQuery.Builder fquery = new BooleanQuery.Builder();
       for (String fq : fqs) {
@@ -254,17 +256,16 @@
     return null;
   }
 
-  private void doSearch1(SolrDocumentList results,
-      SolrIndexSearcher searcher, String q, Query filter,
-      int ndocs, SolrQueryRequest req,
-      Map<String,SchemaField> fields, Set<Integer> alreadyFound)
-      throws IOException {
+  private void doSearch1(SolrDocumentList results, SolrIndexSearcher searcher,
+                         String q, Query filter, int ndocs, SolrQueryRequest req,
+                         Map<String,SchemaField> fields, Set<Integer> alreadyFound)
+          throws IOException {
 
     // build custom query and extra fields
     Map<String,Object> extraFields = new HashMap<>();
     extraFields.put("search_type", "search1");
     boolean includeScore =
-        req.getParams().get(CommonParams.FL).contains("score");
+        req.getParams().get(CommonParams.FL).contains(SCORE);
 
     int maxDocsPerSearcherType = 0;
     float maprelScoreCutoff = 2.0f;
@@ -296,7 +297,7 @@
         sdoc.addField(extraField, extraFields.get(extraField));
       }
       if (includeScore) {
-        sdoc.addField("score", hit.score);
+        sdoc.addField(SCORE, hit.score);
       }
       results.add(sdoc);
       alreadyFound.add(hit.doc);
@@ -315,9 +316,3 @@
     }
   }
 }
-
-/*
- *
- *
- * http://localhost:8080/solr/syntgen/?q=add-style-to-your-every-day-fresh-design-iphone-cases&t1=Personalized+iPhone+Cases&d1=Add+style+to+your+every+day+with+a+custom+iPhone+case&t2=Personalized+iPhone+Cases&d2=Add+style+to+your+every+day+with+a+custom+iPhone+case&t3=Personalized+iPhone+Cases&d3=Add+style+to+your+every+day+with+a+custom+iPhone+case&t4=Personalized+iPhone+Cases&d4=add+style+to+your+every+day+with+a+custom+iPhone+case
- * */
diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
index 8224273..54faa5d 100644
--- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
+++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java
@@ -24,7 +24,7 @@ import java.util.Collections;
 import java.util.List;
 import java.util.Map;
 
-import org.apache.commons.collections.ListUtils;
+import org.apache.commons.collections4.ListUtils;
 import org.apache.commons.lang3.StringUtils;
 
 import opennlp.tools.parse_thicket.ParseTreeNode;
@@ -32,7 +32,13 @@ import opennlp.tools.parse_thicket.ParseTreeNode;
 public class ParseTreeChunk implements Serializable {
 
   private static final long serialVersionUID = -9007722991829174647L;
-  private String mainPOS;
+  private static final String COLON = ":";
+  private static final String ASTERISK = "*";
+  private static final String DASH = "-";
"-"; + private static final String WHITESPACE = " "; + private static final String HASH = "#"; + + private String mainPOS; private List<String> lemmas; @@ -71,7 +77,7 @@ public class ParseTreeChunk implements Serializable { this.mainPOS = StringUtils.substringBetween(phrStr, ">", "'"); for(String part: parts){ String lemma = StringUtils.substringBetween(part, "P'", "':"); - String pos = part.substring(part.indexOf(":")+1, part.length()); + String pos = part.substring(part.indexOf(COLON)+1, part.length()); if (pos==null || lemma ==null){ continue; @@ -173,7 +179,7 @@ public class ParseTreeChunk implements Serializable { public List<ParseTreeChunk> buildChunks(List<LemmaPair> parseResults) { List<ParseTreeChunk> chunksResults = new ArrayList<>(); for (LemmaPair chunk : parseResults) { - String[] lemmasAr = chunk.getLemma().split(" "); + String[] lemmasAr = chunk.getLemma().split(WHITESPACE); List<String> poss = new ArrayList<>(), lems = new ArrayList<>(); for (String lem : lemmasAr) { lems.add(lem); @@ -220,9 +226,11 @@ public class ParseTreeChunk implements Serializable { // groups noun phrases, verb phrases, propos phrases etc. for separate match - public List<List<ParseTreeChunk>> groupChunksAsParses( - List<ParseTreeChunk> parseResults) { - List<ParseTreeChunk> np = new ArrayList<>(), vp = new ArrayList<>(), prp = new ArrayList<>(), sbarp = new ArrayList<>(), pp = new ArrayList<>(), adjp = new ArrayList<>(), whadvp = new ArrayList<>(), restOfPhrasesTypes = new ArrayList<>(); + public List<List<ParseTreeChunk>> groupChunksAsParses(List<ParseTreeChunk> parseResults) { + List<ParseTreeChunk> np = new ArrayList<>(), vp = new ArrayList<>(), prp = new ArrayList<>(), + sbarp = new ArrayList<>(), pp = new ArrayList<>(), + adjp = new ArrayList<>(), whadvp = new ArrayList<>(), + restOfPhrasesTypes = new ArrayList<>(); List<List<ParseTreeChunk>> results = new ArrayList<>(); for (ParseTreeChunk ch : parseResults) { String mainPos = ch.getMainPOS().toLowerCase(); @@ -276,7 +284,7 @@ public class ParseTreeChunk implements Serializable { public List<List<ParseTreeChunk>> matchTwoSentencesGroupedChunks( List<List<ParseTreeChunk>> sent1, List<List<ParseTreeChunk>> sent2) { List<List<ParseTreeChunk>> results = new ArrayList<>(); - // first irerate through component + // first iterate through component for (int comp = 0; comp < 2 && // just np & vp comp < sent1.size() && comp < sent2.size(); comp++) { List<ParseTreeChunk> resultComps = new ArrayList<>(); @@ -284,8 +292,7 @@ public class ParseTreeChunk implements Serializable { for (ParseTreeChunk ch1 : sent1.get(comp)) { for (ParseTreeChunk ch2 : sent2.get(comp)) { // simpler version ParseTreeChunk chunkToAdd = parseTreeMatcher - .generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms( - ch1, ch2); + .generalizeTwoGroupedPhrasesRandomSelectHighestScoreWithTransforms(ch1, ch2); if (!lemmaFormManager.mustOccurVerifier(ch1, ch2, chunkToAdd)) { continue; // if the words which have to stay do not stay, proceed to @@ -298,8 +305,7 @@ public class ParseTreeChunk implements Serializable { break; } - if (parseTreeMatcher - .generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk, + if (parseTreeMatcher.generalizeTwoGroupedPhrasesRandomSelectHighestScore(chunk, chunkToAdd).equalsTo(chunkToAdd)) { alreadyThere = true; break; @@ -371,7 +377,7 @@ public class ParseTreeChunk implements Serializable { } // this => * ch=> run - if (!this.lemmas.get(i).equals(lems.get(i)) && this.lemmas.get(i).equals("*")) + if (!this.lemmas.get(i).equals(lems.get(i)) && 
this.lemmas.get(i).equals(ASTERISK)) notSubChunkWithGivenAlignment = true; } if (!notSubChunkWithGivenAlignment && !unComparable) @@ -395,7 +401,7 @@ public class ParseTreeChunk implements Serializable { } // this => * ch=> run - if (!thisLemma.get(i).equals(chLemma.get(i)) && thisLemma.get(i).equals("*")) + if (!thisLemma.get(i).equals(chLemma.get(i)) && thisLemma.get(i).equals(ASTERISK)) notSubChunkWithGivenAlignment = true; } @@ -430,11 +436,11 @@ public class ParseTreeChunk implements Serializable { if (mainPOS != null) buf = new StringBuilder(mainPOS + " ["); for (int i = 0; i < lemmas.size() && i < POSs.size() ; i++) { - buf.append(POSs.get(i)).append("-").append(lemmas.get(i)).append(" "); + buf.append(POSs.get(i)).append(DASH).append(lemmas.get(i)).append(WHITESPACE); if (this.parseTreeNodes!=null){ Map<String, Object> attrs = this.parseTreeNodes.get(i).getAttributes(); if (attrs!=null && attrs.keySet().size()>0){ - buf.append(attrs).append(" "); + buf.append(attrs).append(WHITESPACE); } String ner =this.parseTreeNodes.get(i).getNe(); if (ner!=null && ner.length()>1) @@ -448,7 +454,7 @@ public class ParseTreeChunk implements Serializable { StringBuilder buf = new StringBuilder(); for (String lemma : lemmas) { - buf.append(lemma).append(" "); + buf.append(lemma).append(WHITESPACE); } return buf.toString().trim(); } @@ -463,25 +469,25 @@ public class ParseTreeChunk implements Serializable { public String listToString(List<List<ParseTreeChunk>> chunks) { StringBuilder buf = new StringBuilder(); - if (chunks.get(0).size() > 0) { + if (!chunks.get(0).isEmpty()) { buf.append(" np ").append(chunks.get(0).toString()); } - if (chunks.get(1).size() > 0) { + if (!chunks.get(1).isEmpty()) { buf.append(" vp ").append(chunks.get(1).toString()); } if (chunks.size() < 3) { return buf.toString(); } - if (chunks.get(2).size() > 0) { + if (!chunks.get(2).isEmpty()) { buf.append(" prp ").append(chunks.get(2).toString()); } - if (chunks.get(3).size() > 0) { + if (!chunks.get(3).isEmpty()) { buf.append(" pp ").append(chunks.get(3).toString()); } - if (chunks.get(4).size() > 0) { + if (!chunks.get(4).isEmpty()) { buf.append(" adjp ").append(chunks.get(4).toString()); } - if (chunks.get(5).size() > 0) { + if (!chunks.get(5).isEmpty()) { buf.append(" whadvp ").append(chunks.get(5).toString()); } /* @@ -502,17 +508,17 @@ public class ParseTreeChunk implements Serializable { toParse = toParse.replace(" ]], [ [", "&"); String[] phraseTypeFragments = toParse.trim().split("&"); for (String toParseFragm : phraseTypeFragments) { - toParseFragm = toParseFragm.replace("], [", "#"); + toParseFragm = toParseFragm.replace("], [", HASH); List<ParseTreeChunk> resultsPhraseType = new ArrayList<>(); - String[] indivChunks = toParseFragm.trim().split("#"); + String[] indivChunks = toParseFragm.trim().split(HASH); for (String expr : indivChunks) { List<String> lems = new ArrayList<>(), poss = new ArrayList<>(); expr = expr.replace("[", "").replace(" ]", ""); - String[] pairs = expr.trim().split(" "); + String[] pairs = expr.trim().split(WHITESPACE); for (String word : pairs) { word = word.replace("]]", "").replace("]", ""); - String[] pos_lem = word.split("-"); + String[] pos_lem = word.split(DASH); lems.add(pos_lem[1].trim()); poss.add(pos_lem[0].trim()); }
