http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java index 23fd5a3..25d5ac5 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java @@ -1,166 +1,192 @@ -package opennlp.tools.parse_thicket.pattern_structure; - -import java.util.*; -import java.io.*; - -import opennlp.tools.parse_thicket.ParseCorefsBuilder; -import opennlp.tools.parse_thicket.ParseThicket; -import opennlp.tools.parse_thicket.ParseTreeNode; -import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder; -import opennlp.tools.textsimilarity.ParseTreeChunk; -import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; - - -public class PhrasePatternStructure { - int objectCount; - int attributeCount; - ArrayList<PhraseConcept> conceptList; - ParseTreeMatcherDeterministic md; - public PhrasePatternStructure(int objectCounts, int attributeCounts) { - objectCount = objectCounts; - attributeCount = attributeCounts; - conceptList = new ArrayList<PhraseConcept>(); - PhraseConcept bottom = new PhraseConcept(); - md = new ParseTreeMatcherDeterministic(); - /*Set<Integer> b_intent = new HashSet<Integer>(); - for (int index = 0; index < attributeCount; ++index) { - b_intent.add(index); - } - bottom.setIntent(b_intent);*/ - bottom.setPosition(0); - conceptList.add(bottom); - } - public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int Generator) { - boolean parentIsMaximal = true; - while(parentIsMaximal) { - parentIsMaximal = false; - for (int parent : conceptList.get(Generator).parents) { - if (conceptList.get(parent).intent.containsAll(intent)) { - Generator = parent; - parentIsMaximal = true; - break; - } - } - } - return Generator; - } - public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) { - System.out.println("debug"); - System.out.println("called for " + intent); - //printLattice(); - int generator_tmp = GetMaximalConcept(intent, generator); - generator = generator_tmp; - if (conceptList.get(generator).intent.equals(intent)) { - System.out.println("at generator:" + conceptList.get(generator).intent); - System.out.println("to add:" + intent); - - System.out.println("already generated"); - return generator; - } - Set<Integer> generatorParents = conceptList.get(generator).parents; - Set<Integer> newParents = new HashSet<Integer>(); - for (int candidate : generatorParents) { - if (!intent.containsAll(conceptList.get(candidate).intent)) { - //if (!conceptList.get(candidate).intent.containsAll(intent)) { - //Set<Integer> intersection = new HashSet<Integer>(conceptList.get(candidate).intent); - //List<List<ParseTreeChunk>> intersection = new ArrayList<List<ParseTreeChunk>>(conceptList.get(candidate).intent); - //intersection.retainAll(intent); - List<List<ParseTreeChunk>> intersection = md - .matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent); - System.out.println("recursive call (inclusion)"); - candidate = AddIntent(intersection, candidate); - } - boolean addParents = true; - System.out.println("now iterating over parents"); - Iterator<Integer> iterator = newParents.iterator(); - while (iterator.hasNext()) { - Integer parent = iterator.next(); - if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) { - addParents = false; - break; - } - else { - if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) { - iterator.remove(); - } - } - } - /*for (int parent : newParents) { - System.out.println("parent = " + parent); - System.out.println("candidate intent:"+conceptList.get(candidate).intent); - System.out.println("parent intent:"+conceptList.get(parent).intent); - - if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) { - addParents = false; - break; - } - else { - if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) { - newParents.remove(parent); - } - } - }*/ - if (addParents) { - newParents.add(candidate); - } - } - System.out.println("size of lattice: " + conceptList.size()); - PhraseConcept newConcept = new PhraseConcept(); - newConcept.setIntent(intent); - newConcept.setPosition(conceptList.size()); - conceptList.add(newConcept); - conceptList.get(generator).parents.add(newConcept.position); - for (int newParent: newParents) { - if (conceptList.get(generator).parents.contains(newParent)) { - conceptList.get(generator).parents.remove(newParent); - } - conceptList.get(newConcept.position).parents.add(newParent); - } - return newConcept.position; - } - public void printLatticeStats() { - System.out.println("Lattice stats"); - System.out.println("max_object_index = " + objectCount); - System.out.println("max_attribute_index = " + attributeCount); - System.out.println("Current concept count = " + conceptList.size()); - } - public void printLattice() { - for (int i = 0; i < conceptList.size(); ++i) { - printConceptByPosition(i); - } - } - public void printConceptByPosition(int index) { - System.out.println("Concept at position " + index); - conceptList.get(index).printConcept(); - } - public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara( - List<List<ParseTreeNode>> phrs) { - List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>(); - List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(), - pps = new ArrayList<ParseTreeChunk>(); - for(List<ParseTreeNode> ps:phrs){ - ParseTreeChunk ch = convertNodeListIntoChunk(ps); - String ptype = ps.get(0).getPhraseType(); - if (ptype.equals("NP")){ - nps.add(ch); - } else if (ptype.equals("VP")){ - vps.add(ch); - } else if (ptype.equals("PP")){ - pps.add(ch); - } - } - results.add(nps); results.add(vps); results.add(pps); - return results; - } - private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) { - List<String> lemmas = new ArrayList<String>(), poss = new ArrayList<String>(); - for(ParseTreeNode n: ps){ - lemmas.add(n.getWord()); - poss.add(n.getPos()); - } - ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0); - ch.setMainPOS(ps.get(0).getPhraseType()); - return ch; - } - -} \ No newline at end of file +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package opennlp.tools.parse_thicket.pattern_structure; + + + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Iterator; +import java.util.List; +import java.util.Set; + +import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.textsimilarity.ParseTreeChunk; +import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic; + + +public class PhrasePatternStructure { + int objectCount; + int attributeCount; + public List<PhraseConcept> conceptList; + ParseTreeMatcherDeterministic md; + public PhrasePatternStructure(int objectCounts, int attributeCounts) { + objectCount = objectCounts; + attributeCount = attributeCounts; + conceptList = new ArrayList<PhraseConcept>(); + PhraseConcept bottom = new PhraseConcept(); + md = new ParseTreeMatcherDeterministic(); + /*Set<Integer> b_intent = new HashSet<Integer>(); + for (int index = 0; index < attributeCount; ++index) { + b_intent.add(index); + } + bottom.setIntent(b_intent);*/ + bottom.setPosition(0); + conceptList.add(bottom); + } + public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int Generator) { + boolean parentIsMaximal = true; + while(parentIsMaximal) { + parentIsMaximal = false; + for (int parent : conceptList.get(Generator).parents) { + if (conceptList.get(parent).intent.containsAll(intent)) { + Generator = parent; + parentIsMaximal = true; + break; + } + } + } + return Generator; + } + public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) { + System.out.println("debug"); + System.out.println("called for " + intent); + //printLattice(); + int generator_tmp = GetMaximalConcept(intent, generator); + generator = generator_tmp; + if (conceptList.get(generator).intent.equals(intent)) { + System.out.println("at generator:" + conceptList.get(generator).intent); + System.out.println("to add:" + intent); + System.out.println("already generated"); + return generator; + } + Set<Integer> generatorParents = conceptList.get(generator).parents; + Set<Integer> newParents = new HashSet<Integer>(); + for (int candidate : generatorParents) { + if (!intent.containsAll(conceptList.get(candidate).intent)) { + //if (!conceptList.get(candidate).intent.containsAll(intent)) { + //Set<Integer> intersection = new HashSet<Integer>(conceptList.get(candidate).intent); + //List<List<ParseTreeChunk>> intersection = new ArrayList<List<ParseTreeChunk>>(conceptList.get(candidate).intent); + //intersection.retainAll(intent); + List<List<ParseTreeChunk>> intersection = md + .matchTwoSentencesGroupedChunksDeterministic(intent, conceptList.get(candidate).intent); + System.out.println("recursive call (inclusion)"); + candidate = AddIntent(intersection, candidate); + } + boolean addParents = true; + System.out.println("now iterating over parents"); + Iterator<Integer> iterator = newParents.iterator(); + while (iterator.hasNext()) { + Integer parent = iterator.next(); + if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) { + addParents = false; + break; + } + else { + if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) { + iterator.remove(); + } + } + } + /*for (int parent : newParents) { + System.out.println("parent = " + parent); + System.out.println("candidate intent:"+conceptList.get(candidate).intent); + System.out.println("parent intent:"+conceptList.get(parent).intent); + + if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) { + addParents = false; + break; + } + else { + if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) { + newParents.remove(parent); + } + } + }*/ + if (addParents) { + newParents.add(candidate); + } + } + System.out.println("size of lattice: " + conceptList.size()); + PhraseConcept newConcept = new PhraseConcept(); + newConcept.setIntent(intent); + newConcept.setPosition(conceptList.size()); + conceptList.add(newConcept); + conceptList.get(generator).parents.add(newConcept.position); + for (int newParent: newParents) { + if (conceptList.get(generator).parents.contains(newParent)) { + conceptList.get(generator).parents.remove(newParent); + } + conceptList.get(newConcept.position).parents.add(newParent); + } + return newConcept.position; + } + + public void printLatticeStats() { + System.out.println("Lattice stats"); + System.out.println("max_object_index = " + objectCount); + System.out.println("max_attribute_index = " + attributeCount); + System.out.println("Current concept count = " + conceptList.size()); + + } + + public void printLattice() { + for (int i = 0; i < conceptList.size(); ++i) { + printConceptByPosition(i); + } + } + + public void printConceptByPosition(int index) { + System.out.println("Concept at position " + index); + conceptList.get(index).printConcept(); + } + + public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara( + List<List<ParseTreeNode>> phrs) { + List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>(); + List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(), + pps = new ArrayList<ParseTreeChunk>(); + for(List<ParseTreeNode> ps:phrs){ + ParseTreeChunk ch = convertNodeListIntoChunk(ps); + String ptype = ps.get(0).getPhraseType(); + System.out.println(ps); + if (ptype.equals("NP")){ + nps.add(ch); + } else if (ptype.equals("VP")){ + vps.add(ch); + } else if (ptype.equals("PP")){ + pps.add(ch); + } + } + results.add(nps); results.add(vps); results.add(pps); + return results; + } + + private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) { + List<String> lemmas = new ArrayList<String>(), poss = new ArrayList<String>(); + for(ParseTreeNode n: ps){ + lemmas.add(n.getWord()); + poss.add(n.getPos()); + } + ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0); + ch.setMainPOS(ps.get(0).getPhraseType()); + return ch; + } + + +} +
http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java index 3a36e80..96bec44 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.parse_thicket.rhetoric_structure; import java.util.ArrayList; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java index 060d32f..3b1c576 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.parse_thicket.rhetoric_structure; import java.util.ArrayList; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java index c9b1f76..cd0e541 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java @@ -21,6 +21,8 @@ import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; +import org.apache.commons.lang.StringUtils; + import net.billylieurance.azuresearch.AzureSearchImageQuery; import net.billylieurance.azuresearch.AzureSearchImageResult; import net.billylieurance.azuresearch.AzureSearchResultSet; @@ -29,7 +31,11 @@ import net.billylieurance.azuresearch.AzureSearchWebResult; public class BingQueryRunner { - protected static String BING_KEY = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0="; + protected static String BING_KEY = + "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA"; + //"pjtCgujmf9TtfjCVBdcQ2rBUQwGLmtLtgCG4Ex7kekw"; + //"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0="; + //"Cec1TlE67kPGDA/1MbeqPfHzP0I1eJypf3o0pYxRsuU="; private static final Logger LOG = Logger .getLogger("opennlp.tools.similarity.apps.BingQueryRunner"); protected AzureSearchWebQuery aq = new AzureSearchWebQuery(); @@ -39,11 +45,32 @@ public class BingQueryRunner { BING_KEY = key; } + private int MAX_QUERY_LENGTH = 100; + public void setLang(String language){ aq.setMarket(language); } + public List<HitBase> runSearchMultiplePages(String query, int nPages) { + List<HitBase> results = new ArrayList<HitBase>(); + for(int i=0; i< nPages; i++){ + aq.setPage(i); + results.addAll( runSearch(query, 50)); + } + return results; + } + public List<HitBase> runSearch(String query, int nRes) { + + if (query.length()>MAX_QUERY_LENGTH){ + try { + query = query.substring(0, MAX_QUERY_LENGTH); + //should not cut words, need the last space to end the query + query = query.substring(0, StringUtils.lastIndexOf(query, " ")); + } catch (Exception e) { + LOG.severe("Problem reducing the length of query :"+query); + } + } aq.setAppid(BING_KEY); aq.setQuery(query); aq.setPerPage(nRes); @@ -54,8 +81,12 @@ public class BingQueryRunner { try { aq.doQuery(); } catch (Exception e1) { - // TODO Auto-generated catch block - e1.printStackTrace(); + aq.setAppid("Cec1TlE67kPGDA/1MbeqPfHzP0I1eJypf3o0pYxRsuU="); + try { + aq.doQuery(); + } catch (Exception e2) { + e2.printStackTrace(); + } } e.printStackTrace(); } @@ -114,138 +145,12 @@ public class BingQueryRunner { } - /* - private String constructBingUrl(String query, String domainWeb, String lang, - int numbOfHits) throws Exception { - String codedQuery = URLEncoder.encode(query, "UTF-8"); - String yahooRequest = "http://api.search.live.net/json.aspx?Appid=" - + APP_ID + "&query=" + codedQuery // + - // "&sources=web"+ - + "&Sources=News" - // Common request fields (optional) - + "&Version=2.0" + "&Market=en-us" - // + "&Options=EnableHighlighting" - - // News-specific request fields (optional) - + "&News.Offset=0"; - - return yahooRequest; - } - - - - public ArrayList<String> search(String query, String domainWeb, String lang, - int numbOfHits) throws Exception { - URL url = new URL(constructBingUrl(query, domainWeb, lang, numbOfHits)); - URLConnection connection = url.openConnection(); - - String line; - ArrayList<String> result = new ArrayList<String>(); - BufferedReader reader = new BufferedReader(new InputStreamReader( - connection.getInputStream())); - int count = 0; - while ((line = reader.readLine()) != null) { - result.add(line); - count++; - } - return result; - } - - public BingResponse populateBingHit(String response) throws Exception { - BingResponse resp = new BingResponse(); - JSONObject rootObject = new JSONObject(response); - JSONObject responseObject = rootObject.getJSONObject("SearchResponse"); - JSONObject web = responseObject.getJSONObject("News"); - - // the search result is in an array under the name of "results" - JSONArray resultSet = null; - try { - resultSet = web.getJSONArray("Results"); - } catch (Exception e) { - System.err.print("\n!!!!!!!"); - LOG.severe("\nNo search results"); - - } - if (resultSet != null) { - for (int i = 0; i < resultSet.length(); i++) { - HitBase hit = new HitBase(); - JSONObject singleResult = resultSet.getJSONObject(i); - hit.setAbstractText(singleResult.getString("Snippet")); - hit.setDate(singleResult.getString("Date")); - String title = StringUtils.replace(singleResult.getString("Title"), - "î", " "); - hit.setTitle(title); - hit.setUrl(singleResult.getString("Url")); - hit.setSource(singleResult.getString("Source")); - - resp.appendHits(hit); - } - } - return resp; - } - - public List<HitBase> runSearch(String query) { - BingResponse resp = null; - try { - List<String> resultList = search(query, "", "", 8); - resp = populateBingHit(resultList.get(0)); - - } catch (Exception e) { - // e.printStackTrace(); - LOG.severe("No news search results for query " + query); - return null; - } - // cast to super class - List<HitBase> hits = new ArrayList<HitBase>(); - for (HitBase h : resp.getHits()) - hits.add((HitBase) h); - - hits = HitBase.removeDuplicates(hits); - return hits; - } - */ - - // TODO comment back when dependencies resolved (CopyrightViolations) - /* - * public List<CopyrightViolations> runCopyRightViolExtenralSearch(String - * query, String report) { - * - * List<CopyrightViolations> genResult = new ArrayList<CopyrightViolations>(); - * BingResponse newResp = null; StringDistanceMeasurer meas = new - * StringDistanceMeasurer(); try { List<String> resultList = search(query, "", - * "", 5); - * - * BingResponse resp = populateBingHit(resultList.get(0)); - * //printSearchResult(resultList.get(0)); - * - * for(int i=0; i<resp.getHits().size(); i++){ BingHit h1 = - * resp.getHits().get(i); String snippet = h1.getAbstractText(); Double sim = - * meas.measureStringDistance(report, snippet); if - * (sim>snapshotSimilarityThreshold){ //genResult.add(snapshot); - * CopyrightViolations cvr = new CopyrightViolations(); - * cvr.setSnippet(snippet); cvr.setTitle(h1.getTitle()); - * cvr.setUrl(h1.getDisplayUrl()); genResult.add(cvr); log.debug(new - * String("Copyright violation detected in snapshot" - * ).toUpperCase()+" : sim = "+ new Double(sim).toString().substring(0, 3)+ - * " \n "+snippet); - * - * } else { log.debug("Different news: sim = "+ new - * Double(sim).toString().substring(0, 3)+ " \n "+snippet); - * - * } - * - * } - * - * } catch (Exception e) { e.printStackTrace(); } - * - * - * return genResult; } - */ - public static void main(String[] args) { BingQueryRunner self = new BingQueryRunner(); + List<HitBase> resp1 = self.runSearch("albert einstein", 15); + System.out.println(resp1); AzureSearchResultSet<AzureSearchImageResult> res = self.runImageSearch("albert einstein"); System.out.println(res); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java index a934264..d28f4e3 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java @@ -17,25 +17,18 @@ package opennlp.tools.similarity.apps; -import java.io.BufferedReader; -import java.io.FileWriter; -import java.io.IOException; -import java.io.InputStreamReader; -import java.net.URL; -import java.net.URLConnection; -import java.net.URLEncoder; import java.util.ArrayList; import java.util.List; import java.util.logging.Logger; + import net.billylieurance.azuresearch.AzureSearchResultSet; import net.billylieurance.azuresearch.AzureSearchWebQuery; import net.billylieurance.azuresearch.AzureSearchWebResult; import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer; import org.apache.commons.lang.StringUtils; -import org.json.JSONArray; -import org.json.JSONObject; + public class BingWebQueryRunner { @@ -111,6 +104,12 @@ public class BingWebQueryRunner { return 0; } + + public static void main(String[] args) { + BingWebQueryRunner self = new BingWebQueryRunner(); + + List<HitBase> res = self.runSearch ("albert einstein", 10); + } } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java index 428cd4e..a017105 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java @@ -132,7 +132,7 @@ public class ContentGeneratorSupport { return queryArrayStr; } - + public static String[] cleanListOfSents(String[] sents) { List<String> sentsClean = new ArrayList<String>(); for (String s : sents) { @@ -144,11 +144,9 @@ public class ContentGeneratorSupport { } public static String cleanSpacesInCleanedHTMLpage(String pageContent){ //was 4 spaces - //was 3 spaces => now back to 2 + //was 3 spaces => now back to 2 //TODO - verify regexp!! pageContent = pageContent.trim().replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3") - //replaceAll("[a-z] [A-Z]", ". $0")// .replace(" ", - // ". ") .replace("..", ".").replace(". . .", " "). replace(". .",". ").trim(); // sometimes html breaks are converted into ' ' (two spaces), so // we need to put '.' @@ -461,7 +459,22 @@ public class ContentGeneratorSupport { } return (String[]) sentsClean.toArray(new String[0]); } - + + public static String getPortionOfTitleWithoutDelimiters(String title){ + String[] delimiters = new String[]{"\\+","-", "=", "_", "\\)", "\\|"}; + for(String delim: delimiters ){ + String[] split = title.split(delim); + if (split.length>1){ + for(String s: split){ + if (s.indexOf(".")<0) + return s; + } + } + } + + return title; + } + public static void main(String[] args){ String s = "You can grouP parts Of your regular expression In your pattern You grouP elements"; //with round brackets, e.g., ()." + @@ -472,6 +485,15 @@ public class ContentGeneratorSupport { sr1 = s.replaceAll(" [A-Z]", ". $1"); } + public static boolean problematicHitList(List<HitBase> hits){ + if (hits.size()<1) + return true; + for(HitBase hit: hits){ + if (!hit.getFragments().isEmpty()) + return false; + } + return true; + } } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java index 3e79b7a..17421fd 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java @@ -89,7 +89,7 @@ public class GeneratedSentenceProcessor { String[] periods = StringUtils.split(sent.replace('.', '#'), '#'); if ((float) periods.length / (float) spaces.length > 0.2) { - System.out.println("Rejection: too many periods in sent ='"+sent); + //System.out.println("Rejection: too many periods in sent ='"+sent); return null; } // commented [x], to avoid rejection sentences with refs[] @@ -102,7 +102,7 @@ public class GeneratedSentenceProcessor { String[] pipes = StringUtils.split(sent, '|'); if (StringUtils.split(sent, '|').length > 2 || StringUtils.split(sent, '>').length > 2) { - System.out.println("Rejection: too many |s or >s in sent ='"+sent); + //System.out.println("Rejection: too many |s or >s in sent ='"+sent); return null; } String sentTry = sent.toLowerCase(); @@ -200,14 +200,14 @@ public class GeneratedSentenceProcessor { public static boolean isProhibitiveWordsOccurOrStartWith(String sentenceLowercase){ for(String o: occurs){ if (sentenceLowercase.indexOf(o)>-1){ - System.out.println("Found prohibited occurrence "+ o +" \n in sentence = "+ sentenceLowercase); + //System.out.println("Found prohibited occurrence "+ o +" \n in sentence = "+ sentenceLowercase); return true; } } for(String o: occursStartsWith){ if (sentenceLowercase.startsWith(o)){ - System.out.println("Found prohibited occurrence Start With "+ o +" \n in sentence = "+ sentenceLowercase); + //System.out.println("Found prohibited occurrence Start With "+ o +" \n in sentence = "+ sentenceLowercase); return true; } } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java index bfeff62..91f6fda 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java @@ -82,6 +82,8 @@ public class RelatedSentenceFinder { this.RELEVANCE_THRESHOLD=thresh; yrunner.setKey(key); } + + int generateContentAboutIter = 0; public RelatedSentenceFinder() { // TODO Auto-generated constructor stub @@ -171,6 +173,20 @@ public class RelatedSentenceFinder { if (stepCount>MAX_STEPS) break; } + + // if nothing is written, then get first search result and try again + try { + if (generateContentAboutIter<4 && ContentGeneratorSupport.problematicHitList(opinionSentencesToAdd)){ + List<HitBase> resultList = yrunner.runSearch(sentence, 10); + String discoveredSimilarTopic = resultList.get(generateContentAboutIter).getTitle(); + discoveredSimilarTopic = ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(discoveredSimilarTopic); + generateContentAboutIter++; + opinionSentencesToAdd = generateContentAbout(discoveredSimilarTopic); + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } opinionSentencesToAdd = removeDuplicatesFromResultantHits(opinionSentencesToAdd); return opinionSentencesToAdd; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java index b2d2194..1c50fbf 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java @@ -23,8 +23,11 @@ import java.util.Collection; import java.util.HashSet; import java.util.List; +import org.apache.commons.lang.StringUtils; + +import opennlp.tools.similarity.apps.utils.PageFetcher; import opennlp.tools.similarity.apps.utils.StringCleaner; -import opennlp.tools.stemmer.PorterStemmer; +import opennlp.tools.stemmer.PStemmer; import opennlp.tools.textsimilarity.ParseTreeChunk; import opennlp.tools.textsimilarity.SentencePairMatchResult; import opennlp.tools.textsimilarity.TextProcessor; @@ -34,7 +37,8 @@ public class StoryDiscourseNavigator { protected BingQueryRunner yrunner = new BingQueryRunner(); ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor .getInstance(); - private PorterStemmer ps = new PorterStemmer(); + private PStemmer ps = new PStemmer(); + PageFetcher pFetcher = new PageFetcher(); public static final String[] frequentPerformingVerbs = { " born raised meet learn ", " graduated enter discover", @@ -53,8 +57,34 @@ public class StoryDiscourseNavigator { "meet enjoy follow create", "discover continue produce" }; + + private String[] obtainKeywordsForAnEntityFromWikipedia(String entity){ + yrunner.setKey("xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc"); + List<HitBase> resultList = yrunner.runSearch(entity, 20); + HitBase h = null; + for (int i = 0; i < resultList.size(); i++) { + h = resultList.get(i); + if (h.getUrl().indexOf("wikipedia.")>-1) + break; + } + String content = pFetcher.fetchOrigHTML(h.getUrl()); + content = content.replace("\"><a href=\"#", "&_&_&_&"); + String[] portions = StringUtils.substringsBetween(content, "&_&_&_&", "\"><span"); + List<String> results = new ArrayList<String>(); + for(int i = 0; i< portions.length; i++){ + if (portions[i].indexOf("cite_note")>-1) + continue; + results.add(entity + " " + portions[i].replace('_', ' ').replace('.',' ')); + } + return results.toArray(new String[0]); + } public String[] obtainAdditionalKeywordsForAnEntity(String entity){ + String[] keywordsFromWikipedia = obtainKeywordsForAnEntityFromWikipedia(entity); + // these keywords should include *entity* + if (keywordsFromWikipedia!=null && keywordsFromWikipedia.length>3) + return keywordsFromWikipedia; + List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath( entity, "", "en", 30); Collection<String> keywordsToRemove = TextProcessor.fastTokenize(entity.toLowerCase(), false); @@ -70,7 +100,7 @@ public class StoryDiscourseNavigator { return res; } - public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, + private List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query, String domain, String lang, int numbOfHits) { List<List<ParseTreeChunk>> genResult = new ArrayList<List<ParseTreeChunk>>(); try { @@ -127,5 +157,7 @@ public class StoryDiscourseNavigator { public static void main(String[] args){ String[] res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert Einstein"); System.out.println(Arrays.asList(res)); + res = new StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("search engine marketing"); + System.out.println(Arrays.asList(res)); } } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java index 0e8d743..41afe36 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java @@ -1,7 +1,24 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.similarity.apps.solr; import java.io.BufferedReader; import java.io.File; +import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; @@ -61,6 +78,7 @@ public class ContentGeneratorRequestHandler extends SearchHandler { private static Logger LOG = Logger .getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler"); private ParserChunker2MatcherProcessor sm = null; + WordDocBuilderEndNotes docBuilder = new WordDocBuilderEndNotes (); public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse rsp){ @@ -142,10 +160,9 @@ public class ContentGeneratorRequestHandler extends SearchHandler { } public String cgRunner(String[] args) { - ParserChunker2MatcherProcessor sm = null; int count=0; for(String a: args){ - System.out.print(count+" >> " + a); + System.out.print(count+">>" + a + " | "); count++; } @@ -164,13 +181,13 @@ public class ContentGeneratorRequestHandler extends SearchHandler { String bingKey = args[7]; if (bingKey == null){ - bingKey = //"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0="; - "xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc"; + bingKey = "e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0="; + //"xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc"; } RelatedSentenceFinder f = null; String lang = args[6]; - if (lang.startsWith("es")){ + if (lang.startsWith("es") || lang.startsWith("ru") || lang.startsWith("de")){ f = new RelatedSentenceFinderML(Integer.parseInt(args[3]), Integer.parseInt(args[4]), Float.parseFloat(args[5]), bingKey); f.setLang(lang); } else @@ -184,14 +201,28 @@ public class ContentGeneratorRequestHandler extends SearchHandler { try { hits = f.generateContentAbout(args[0].replace('+', ' ').replace('"', ' ').trim()); + System.out.println(HitBase.toString(hits)); - generatedContent = HitBase.toResultantString(hits); + generatedContent = HitBase.toResultantString(hits) + "\n REFERENCES \n" + HitBase.produceReferenceSection(hits) ; + try { + writeResultInAFile(args[0].replace('+', ' '), generatedContent); + } catch (Exception e2) { + e2.printStackTrace(); + } + + String attachmentFileName = null; + try { + attachmentFileName = docBuilder.buildWordDoc(hits, args[0].replace('+', ' ').replace('"', ' ')); + } catch (Exception e2) { + e2.printStackTrace(); + } + opennlp.tools.apps.utils.email.EmailSender s = new opennlp.tools.apps.utils.email.EmailSender(); try { s.sendMail("smtp.rambler.ru", "[email protected]", "pill0693", new InternetAddress("[email protected]"), new InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new InternetAddress[]{}, - "Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, null); + "Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, attachmentFileName); } catch (AddressException e) { // TODO Auto-generated catch block e.printStackTrace(); @@ -200,7 +231,7 @@ public class ContentGeneratorRequestHandler extends SearchHandler { e.printStackTrace(); try { s.sendMail("smtp.rambler.ru", "[email protected]", "pill0693", new InternetAddress("[email protected]"), new InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new InternetAddress[]{}, - "Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, null); + "Generated content for you on '"+args[0].replace('+', ' ')+"'", generatedContent, attachmentFileName); } catch (Exception e1) { // TODO Auto-generated catch block e1.printStackTrace(); @@ -214,6 +245,40 @@ public class ContentGeneratorRequestHandler extends SearchHandler { return generatedContent; } + private void writeResultInAFile(String title, String content){ + FileOutputStream fop = null; + File file; + String absPath = new File(".").getAbsolutePath(); + absPath = absPath.substring(0, absPath.length()-1); + + try { + + file = new File(absPath+"/written/"+ title.replace(' ','_').replace('\"', ' ').trim()+ ".txt"); + // if file doesnt exists, then create it + if (!file.exists()) { + file.createNewFile(); + } + fop = new FileOutputStream(file); + + // get the content in bytes + byte[] contentInBytes = content.getBytes(); + + fop.write(contentInBytes); + fop.flush(); + fop.close(); + + } catch (IOException e) { + e.printStackTrace(); + } finally { + try { + if (fop != null) { + fop.close(); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java index 14dc9ff..6693bbf 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.similarity.apps.solr; import java.io.IOException; @@ -139,13 +155,13 @@ public class IterativeQueryComponent extends QueryComponent{ e.printStackTrace(); } rb.setQparser(parser); - try { + /* try { rb.setScoreDoc(parser.getPaging()); } catch (Exception e) { // TODO Auto-generated catch block e.printStackTrace(); } - +*/ String[] fqs = rb.req.getParams().getParams(CommonParams.FQ); if (fqs!=null && fqs.length!=0) { List<Query> filters = rb.getFilters(); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java index 87f5ed9..be125b7 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.similarity.apps.solr; import java.io.IOException; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java index 0876700..413dd5d 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.similarity.apps.solr; import java.io.IOException; @@ -58,8 +74,8 @@ public class NLProgram2CodeRequestHandler extends SearchHandler { private ParseTreeChunkListScorer parseTreeChunkListScorer = new ParseTreeChunkListScorer(); private ParserChunker2MatcherProcessor sm = null; private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3; - private static String resourceDir = "/home/solr/solr-4.4.0/example/src/test/resources"; - //"C:/workspace/TestSolr/src/test/resources"; + private static String resourceDir = //"/home/solr/solr-4.4.0/example/src/test/resources"; + "C:/workspace/TestSolr/src/test/resources"; //"/data1/solr/example/src/test/resources"; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java index fbef398..b259528 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.similarity.apps.solr; import java.io.IOException; @@ -164,7 +180,7 @@ public class SearchResultsReRankerRequestHandler extends SearchHandler { NamedList<Object> values = rsp.getValues(); values.remove("response"); values.add("response", scoreNum); - values.add("new_order", bufNums.toString().trim()); + //values.add("new_order", bufNums.toString().trim()); rsp.setAllValues(values); } @@ -187,9 +203,7 @@ public class SearchResultsReRankerRequestHandler extends SearchHandler { private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits, String searchQuery) { try { - System.out.println("loading openNLP models...from "+resourceDir); sm = ParserChunker2MatcherProcessor.getInstance(resourceDir); - System.out.println("DONE loading openNLP model s."); } catch (Exception e){ LOG.severe(e.getMessage()); } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java index b2d6295..d2f4b1b 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java @@ -1,3 +1,19 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package opennlp.tools.similarity.apps.solr; import java.io.IOException; @@ -56,7 +72,6 @@ import org.apache.solr.search.DocList; import org.apache.solr.search.DocSlice; import org.apache.solr.search.QParser; import org.apache.solr.search.SolrIndexSearcher; - import org.apache.solr.util.RTimer; import org.apache.solr.util.SolrPluginUtils; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java index 84440bd..59f2146 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java @@ -26,7 +26,7 @@ import java.util.logging.Logger; import opennlp.tools.similarity.apps.BingQueryRunner; import opennlp.tools.similarity.apps.HitBase; import opennlp.tools.similarity.apps.utils.StringCleaner; -import opennlp.tools.stemmer.PorterStemmer; +import opennlp.tools.stemmer.PStemmer; import opennlp.tools.textsimilarity.ParseTreeChunk; import opennlp.tools.textsimilarity.ParseTreeChunkListScorer; import opennlp.tools.textsimilarity.SentencePairMatchResult; @@ -51,7 +51,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner { private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new HashMap<String, List<List<String>>>(); private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords = new HashMap<List<String>, List<List<String>>>(); - private PorterStemmer ps; + private PStemmer ps; public Map<List<String>, List<List<String>>> getAssocWords_ExtendedAssocWords() { return assocWords_ExtendedAssocWords; @@ -73,7 +73,7 @@ public class TaxonomyExtenderViaMebMining extends BingQueryRunner { System.err.println("Problem loading synt matcher"); } - ps = new PorterStemmer(); + ps = new PStemmer(); } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java index 16e9fb2..a70340e 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java @@ -22,10 +22,13 @@ import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.io.Serializable; +import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; +import opennlp.tools.jsmlearning.ProfileReaderWriter; + /** * This class stores the taxonomy on the file-system * @@ -80,6 +83,31 @@ public class TaxonomySerializer implements Serializable { ex.printStackTrace(); } + String csvFilename = filename+".csv"; + List<String[]> taxo_list = new ArrayList<String[]>(); + List<String> entries = new ArrayList<String>(lemma_ExtendedAssocWords.keySet()); + for(String e: entries){ + List<String> lines = new ArrayList<String>(); + lines.add(e); + for(List<String> ls: lemma_ExtendedAssocWords.get(e)){ + lines.add(ls.toString()); + } + taxo_list.add((String[])lines.toArray(new String[0])); + } + ProfileReaderWriter.writeReport(taxo_list, csvFilename); + + String csvFilenameListEntries = filename+"_ListEntries.csv"; + taxo_list = new ArrayList<String[]>(); + List<List<String>> entriesList = new ArrayList<List<String>>( assocWords_ExtendedAssocWords.keySet()); + for(List<String> e: entriesList){ + List<String> lines = new ArrayList<String>(); + lines.addAll(e); + for(List<String> ls: assocWords_ExtendedAssocWords.get(e)){ + lines.add(ls.toString()); + } + taxo_list.add((String[])lines.toArray(new String[0])); + } + ProfileReaderWriter.writeReport(taxo_list, csvFilenameListEntries); } public static TaxonomySerializer readTaxonomy(String filename) { http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java index 4c01e39..7f17f84 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java @@ -18,6 +18,7 @@ package opennlp.tools.similarity.apps.utils; import java.io.BufferedReader; +import java.io.File; import java.io.IOException; import java.io.InputStreamReader; import java.net.MalformedURLException; @@ -27,54 +28,94 @@ import java.util.logging.Logger; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.AutoDetectParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; + public class PageFetcher { - private static final Logger LOG = Logger + private static final Logger log = Logger .getLogger("opennlp.tools.similarity.apps.utils.PageFetcher"); + Tika tika = new Tika(); - private static int DEFAULT_TIMEOUT = 15000; + private static int DEFAULT_TIMEOUT = 1500; + private void setTimeout(int to){ + DEFAULT_TIMEOUT = to; + } public String fetchPage(final String url) { return fetchPage(url, DEFAULT_TIMEOUT); } + + public String fetchPageAutoDetectParser(final String url ){ + String fetchURL = addHttp(url); + String pageContent = null; + URLConnection connection; + try { + log.info("fetch url auto detect parser " + url); + connection = new URL(fetchURL).openConnection(); + connection.setReadTimeout(DEFAULT_TIMEOUT); + + //parse method parameters + Parser parser = new AutoDetectParser(); + BodyContentHandler handler = new BodyContentHandler(); + Metadata metadata = new Metadata(); + ParseContext context = new ParseContext(); + + //parsing the file + parser.parse(connection.getInputStream(), handler, metadata, context); + + pageContent = handler.toString(); + } catch (Exception e) { + log.info(e.getMessage() + "\n" + e); + } + return pageContent; + } + public String fetchPage(final String url, final int timeout) { String fetchURL = addHttp(url); - LOG.info("fetch url " + fetchURL); + log.info("fetch url " + fetchURL); String pageContent = null; URLConnection connection; try { - connection = new URL(url).openConnection(); + connection = new URL(fetchURL).openConnection(); connection.setReadTimeout(DEFAULT_TIMEOUT); - Tika tika = new Tika(); + pageContent = tika.parseToString(connection.getInputStream()) .replace('\n', ' ').replace('\t', ' '); } catch (MalformedURLException e) { - LOG.severe(e.getMessage() + "\n" + e); + log.severe(e.getMessage() + "\n" + e); } catch (IOException e) { - LOG.severe(e.getMessage() + "\n" + e); + log.severe(e.getMessage() + "\n" + e); } catch (TikaException e) { - LOG.severe(e.getMessage() + "\n" + e); + log.severe(e.getMessage() + "\n" + e); } return pageContent; } private String addHttp(final String url) { - if (!url.startsWith("http://")) { + if (!url.startsWith("http://") && !url.startsWith("https://")) { return "http://" + url; } return url; } + + public String fetchOrigHTML(String url, int timeout) { + setTimeout(timeout); + return fetchOrigHTML(url); + } public String fetchOrigHTML(String url) { - System.out.println("fetch url " + url); - String pageContent = null; + log.info("fetch url " + url); StringBuffer buf = new StringBuffer(); try { URLConnection connection = new URL(url).openConnection(); - connection.setReadTimeout(10000); + connection.setReadTimeout(DEFAULT_TIMEOUT); connection .setRequestProperty( "User-Agent", @@ -85,8 +126,8 @@ public class PageFetcher { reader = new BufferedReader(new InputStreamReader( connection.getInputStream())); } catch (Exception e) { - // we dont need to log trial web pages if access fails - // LOG.error(e.getMessage(), e); + // we dont always need to log trial web pages if access fails + log.severe(e.toString()); } while ((line = reader.readLine()) != null) { @@ -107,5 +148,19 @@ public class PageFetcher { } */ return buf.toString(); } + + public static void main(String[] args){ + PageFetcher fetcher = new PageFetcher(); + String content = fetcher.fetchPageAutoDetectParser("http://www.elastica.net/"); + System.out.println(content); + content = fetcher. + fetchPageAutoDetectParser("http://www.cnn.com"); + System.out.println(content); + content = new PageFetcher().fetchPage("https://github.com"); + System.out.println(content); + content = new PageFetcher().fetchOrigHTML("http://www.cnn.com"); + System.out.println(content); + + } } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java index c2238c5..377b02a 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java @@ -20,11 +20,11 @@ package opennlp.tools.similarity.apps.utils; import java.util.ArrayList; import java.util.List; -import opennlp.tools.stemmer.PorterStemmer; +import opennlp.tools.stemmer.PStemmer; public class StringDistanceMeasurer { // external tools - private PorterStemmer ps; // stemmer + private PStemmer ps; // stemmer private static final int MIN_STRING_LENGTH_FOR_WORD = 4; @@ -36,7 +36,7 @@ public class StringDistanceMeasurer { public StringDistanceMeasurer() { // first get stemmer - ps = new PorterStemmer(); + ps = new PStemmer(); if (MIN_SCORE_FOR_LING > 1.0) return; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java deleted file mode 100644 index e23da90..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java +++ /dev/null @@ -1,521 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.tools.stemmer; - - - import java.io.IOException; - import java.io.InputStream; - import java.io.FileInputStream; - - import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; - import org.apache.lucene.util.ArrayUtil; - - /** - * - * Stemmer, implementing the Porter Stemming Algorithm - * - * The Stemmer class transforms a word into its root form. The input - * word can be provided a character at time (by calling add()), or at once - * by calling one of the various stem(something) methods. - */ - - public class PorterStemmer - { - private char[] b; - private int i, /* offset into b */ - j, k, k0; - private boolean dirty = false; - private static final int INITIAL_SIZE = 50; - - public PorterStemmer() { - b = new char[INITIAL_SIZE]; - i = 0; - } - - /** - * reset() resets the stemmer so it can stem another word. If you invoke - * the stemmer by calling add(char) and then stem(), you must call reset() - * before starting another word. - */ - public void reset() { i = 0; dirty = false; } - - /** - * Add a character to the word being stemmed. When you are finished - * adding characters, you can call stem(void) to process the word. - */ - public void add(char ch) { - if (b.length <= i) { - b = ArrayUtil.grow(b, i+1); - } - b[i++] = ch; - } - - /** - * After a word has been stemmed, it can be retrieved by toString(), - * or a reference to the internal buffer can be retrieved by getResultBuffer - * and getResultLength (which is generally more efficient.) - */ - @Override - public String toString() { return new String(b,0,i); } - - /** - * Returns the length of the word resulting from the stemming process. - */ - public int getResultLength() { return i; } - - /** - * Returns a reference to a character buffer containing the results of - * the stemming process. You also need to consult getResultLength() - * to determine the length of the result. - */ - public char[] getResultBuffer() { return b; } - - /* cons(i) is true <=> b[i] is a consonant. */ - - private final boolean cons(int i) { - switch (b[i]) { - case 'a': case 'e': case 'i': case 'o': case 'u': - return false; - case 'y': - return (i==k0) ? true : !cons(i-1); - default: - return true; - } - } - - /* m() measures the number of consonant sequences between k0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - <c><v> gives 0 - <c>vc<v> gives 1 - <c>vcvc<v> gives 2 - <c>vcvcvc<v> gives 3 - .... - */ - - private final int m() { - int n = 0; - int i = k0; - while(true) { - if (i > j) - return n; - if (! cons(i)) - break; - i++; - } - i++; - while(true) { - while(true) { - if (i > j) - return n; - if (cons(i)) - break; - i++; - } - i++; - n++; - while(true) { - if (i > j) - return n; - if (! cons(i)) - break; - i++; - } - i++; - } - } - - /* vowelinstem() is true <=> k0,...j contains a vowel */ - - private final boolean vowelinstem() { - int i; - for (i = k0; i <= j; i++) - if (! cons(i)) - return true; - return false; - } - - /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ - - private final boolean doublec(int j) { - if (j < k0+1) - return false; - if (b[j] != b[j-1]) - return false; - return cons(j); - } - - /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - - */ - - private final boolean cvc(int i) { - if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) - return false; - else { - int ch = b[i]; - if (ch == 'w' || ch == 'x' || ch == 'y') return false; - } - return true; - } - - private final boolean ends(String s) { - int l = s.length(); - int o = k-l+1; - if (o < k0) - return false; - for (int i = 0; i < l; i++) - if (b[o+i] != s.charAt(i)) - return false; - j = k-l; - return true; - } - - /* setto(s) sets (j+1),...k to the characters in the string s, readjusting - k. */ - - void setto(String s) { - int l = s.length(); - int o = j+1; - for (int i = 0; i < l; i++) - b[o+i] = s.charAt(i); - k = j+l; - dirty = true; - } - - /* r(s) is used further down. */ - - void r(String s) { if (m() > 0) setto(s); } - - /* step1() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - - */ - - private final void step1() { - if (b[k] == 's') { - if (ends("sses")) k -= 2; - else if (ends("ies")) setto("i"); - else if (b[k-1] != 's') k--; - } - if (ends("eed")) { - if (m() > 0) - k--; - } - else if ((ends("ed") || ends("ing")) && vowelinstem()) { - k = j; - if (ends("at")) setto("ate"); - else if (ends("bl")) setto("ble"); - else if (ends("iz")) setto("ize"); - else if (doublec(k)) { - int ch = b[k--]; - if (ch == 'l' || ch == 's' || ch == 'z') - k++; - } - else if (m() == 1 && cvc(k)) - setto("e"); - } - } - - /* step2() turns terminal y to i when there is another vowel in the stem. */ - - private final void step2() { - if (ends("y") && vowelinstem()) { - b[k] = 'i'; - dirty = true; - } - } - - /* step3() maps double suffices to single ones. so -ization ( = -ize plus - -ation) maps to -ize etc. note that the string before the suffix must give - m() > 0. */ - - private final void step3() { - if (k == k0) return; /* For Bug 1 */ - switch (b[k-1]) { - case 'a': - if (ends("ational")) { r("ate"); break; } - if (ends("tional")) { r("tion"); break; } - break; - case 'c': - if (ends("enci")) { r("ence"); break; } - if (ends("anci")) { r("ance"); break; } - break; - case 'e': - if (ends("izer")) { r("ize"); break; } - break; - case 'l': - if (ends("bli")) { r("ble"); break; } - if (ends("alli")) { r("al"); break; } - if (ends("entli")) { r("ent"); break; } - if (ends("eli")) { r("e"); break; } - if (ends("ousli")) { r("ous"); break; } - break; - case 'o': - if (ends("ization")) { r("ize"); break; } - if (ends("ation")) { r("ate"); break; } - if (ends("ator")) { r("ate"); break; } - break; - case 's': - if (ends("alism")) { r("al"); break; } - if (ends("iveness")) { r("ive"); break; } - if (ends("fulness")) { r("ful"); break; } - if (ends("ousness")) { r("ous"); break; } - break; - case 't': - if (ends("aliti")) { r("al"); break; } - if (ends("iviti")) { r("ive"); break; } - if (ends("biliti")) { r("ble"); break; } - break; - case 'g': - if (ends("logi")) { r("log"); break; } - } - } - - /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ - - private final void step4() { - switch (b[k]) { - case 'e': - if (ends("icate")) { r("ic"); break; } - if (ends("ative")) { r(""); break; } - if (ends("alize")) { r("al"); break; } - break; - case 'i': - if (ends("iciti")) { r("ic"); break; } - break; - case 'l': - if (ends("ical")) { r("ic"); break; } - if (ends("ful")) { r(""); break; } - break; - case 's': - if (ends("ness")) { r(""); break; } - break; - } - } - - /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */ - - private final void step5() { - if (k == k0) return; /* for Bug 1 */ - switch (b[k-1]) { - case 'a': - if (ends("al")) break; - return; - case 'c': - if (ends("ance")) break; - if (ends("ence")) break; - return; - case 'e': - if (ends("er")) break; return; - case 'i': - if (ends("ic")) break; return; - case 'l': - if (ends("able")) break; - if (ends("ible")) break; return; - case 'n': - if (ends("ant")) break; - if (ends("ement")) break; - if (ends("ment")) break; - /* element etc. not stripped before the m */ - if (ends("ent")) break; - return; - case 'o': - if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; - /* j >= 0 fixes Bug 2 */ - if (ends("ou")) break; - return; - /* takes care of -ous */ - case 's': - if (ends("ism")) break; - return; - case 't': - if (ends("ate")) break; - if (ends("iti")) break; - return; - case 'u': - if (ends("ous")) break; - return; - case 'v': - if (ends("ive")) break; - return; - case 'z': - if (ends("ize")) break; - return; - default: - return; - } - if (m() > 1) - k = j; - } - - /* step6() removes a final -e if m() > 1. */ - - private final void step6() { - j = k; - if (b[k] == 'e') { - int a = m(); - if (a > 1 || a == 1 && !cvc(k-1)) - k--; - } - if (b[k] == 'l' && doublec(k) && m() > 1) - k--; - } - - - /** - * Stem a word provided as a String. Returns the result as a String. - */ - public String stem(String s) { - if (stem(s.toCharArray(), s.length())) - return toString(); - else - return s; - } - - /** Stem a word contained in a char[]. Returns true if the stemming process - * resulted in a word different from the input. You can retrieve the - * result with getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] word) { - return stem(word, word.length); - } - - /** Stem a word contained in a portion of a char[] array. Returns - * true if the stemming process resulted in a word different from - * the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] wordBuffer, int offset, int wordLen) { - reset(); - if (b.length < wordLen) { - b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)]; - } - System.arraycopy(wordBuffer, offset, b, 0, wordLen); - i = wordLen; - return stem(0); - } - - /** Stem a word contained in a leading portion of a char[] array. - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] word, int wordLen) { - return stem(word, 0, wordLen); - } - - /** Stem the word placed into the Stemmer buffer through calls to add(). - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem() { - return stem(0); - } - - public boolean stem(int i0) { - k = i - 1; - k0 = i0; - if (k > k0+1) { - step1(); step2(); step3(); step4(); step5(); step6(); - } - // Also, a word is considered dirty if we lopped off letters - // Thanks to Ifigenia Vairelles for pointing this out. - if (i != k+1) - dirty = true; - i = k+1; - return dirty; - } - - /** Test program for demonstrating the Stemmer. It reads a file and - * stems each word, writing the result to standard out. - * Usage: Stemmer file-name - */ - public static void main(String[] args) { - PorterStemmer s = new PorterStemmer(); - - for (int i = 0; i < args.length; i++) { - try { - InputStream in = new FileInputStream(args[i]); - byte[] buffer = new byte[1024]; - int bufferLen, offset, ch; - - bufferLen = in.read(buffer); - offset = 0; - s.reset(); - - while(true) { - if (offset < bufferLen) - ch = buffer[offset++]; - else { - bufferLen = in.read(buffer); - offset = 0; - if (bufferLen < 0) - ch = -1; - else - ch = buffer[offset++]; - } - - if (Character.isLetter((char) ch)) { - s.add(Character.toLowerCase((char) ch)); - } - else { - s.stem(); - System.out.print(s.toString()); - s.reset(); - if (ch < 0) - break; - else { - System.out.print((char) ch); - } - } - } - - in.close(); - } - catch (IOException e) { - System.out.println("error reading " + args[i]); - } - } - } - } - http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java index 1dc100c..a72583e 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java @@ -19,11 +19,11 @@ package opennlp.tools.textsimilarity; import java.util.List; -import opennlp.tools.stemmer.PorterStemmer; +import opennlp.tools.stemmer.PStemmer; public class LemmaFormManager { - public String matchLemmas(PorterStemmer ps, String lemma1, String lemma2, + public String matchLemmas(PStemmer ps, String lemma1, String lemma2, String POS) { if (POS == null) { return null;
