[3/5] opennlp-sandbox git commit: merge from bgalitsky's own git repo

bgalitsky Wed, 16 Nov 2016 01:05:15 -0800

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
index 23fd5a3..25d5ac5 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/pattern_structure/PhrasePatternStructure.java
@@ -1,166 +1,192 @@
-package opennlp.tools.parse_thicket.pattern_structure;
-
-import java.util.*;
-import java.io.*;
-
-import opennlp.tools.parse_thicket.ParseCorefsBuilder;
-import opennlp.tools.parse_thicket.ParseThicket;
-import opennlp.tools.parse_thicket.ParseTreeNode;
-import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder;
-import opennlp.tools.textsimilarity.ParseTreeChunk;
-import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
-
-
-public class PhrasePatternStructure {
-       int objectCount;
-       int attributeCount;
-       ArrayList<PhraseConcept> conceptList;
-       ParseTreeMatcherDeterministic md; 
-       public PhrasePatternStructure(int objectCounts, int attributeCounts) {
-               objectCount = objectCounts;
-               attributeCount = attributeCounts;
-               conceptList = new ArrayList<PhraseConcept>();
-               PhraseConcept bottom = new PhraseConcept();
-               md = new ParseTreeMatcherDeterministic();
-               /*Set<Integer> b_intent = new HashSet<Integer>();
-               for (int index = 0; index < attributeCount; ++index) {
-                       b_intent.add(index);
-               }
-               bottom.setIntent(b_intent);*/
-               bottom.setPosition(0);
-               conceptList.add(bottom);
-       }
-       public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int 
Generator) {
-               boolean parentIsMaximal = true;
-               while(parentIsMaximal) {
-                       parentIsMaximal = false;
-                       for (int parent : conceptList.get(Generator).parents) {
-                               if 
(conceptList.get(parent).intent.containsAll(intent)) {
-                                       Generator = parent;
-                                       parentIsMaximal = true;
-                                       break;
-                               }
-                       }
-               }
-               return Generator;
-       }
-       public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) {
-               System.out.println("debug");
-               System.out.println("called for " + intent);
-               //printLattice();
-               int generator_tmp = GetMaximalConcept(intent, generator);
-               generator = generator_tmp;
-               if (conceptList.get(generator).intent.equals(intent)) {
-                       System.out.println("at generator:" + 
conceptList.get(generator).intent);
-                       System.out.println("to add:" + intent);
-
-                       System.out.println("already generated");
-                       return generator;
-               }
-               Set<Integer> generatorParents = 
conceptList.get(generator).parents;
-               Set<Integer> newParents = new HashSet<Integer>();
-               for (int candidate : generatorParents) {
-                       if 
(!intent.containsAll(conceptList.get(candidate).intent)) {
-                       //if 
(!conceptList.get(candidate).intent.containsAll(intent)) {
-                               //Set<Integer> intersection = new 
HashSet<Integer>(conceptList.get(candidate).intent);
-                               //List<List<ParseTreeChunk>> intersection = new 
ArrayList<List<ParseTreeChunk>>(conceptList.get(candidate).intent);
-                               //intersection.retainAll(intent);
-                               List<List<ParseTreeChunk>> intersection = md
-                               
.matchTwoSentencesGroupedChunksDeterministic(intent, 
conceptList.get(candidate).intent);
-                               System.out.println("recursive call 
(inclusion)");
-                               candidate = AddIntent(intersection, candidate);
-                       }
-                       boolean addParents = true;
-                       System.out.println("now iterating over parents");
-                       Iterator<Integer> iterator = newParents.iterator();
-                       while (iterator.hasNext()) {
-                               Integer parent = iterator.next();
-                               if 
(conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) 
{
-                                       addParents = false;
-                                       break;
-                               }
-                               else {
-                                       if 
(conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) 
{
-                                               iterator.remove();
-                                       }
-                               }
-                       }
-                       /*for (int parent : newParents) {
-                               System.out.println("parent = " + parent);
-                               System.out.println("candidate 
intent:"+conceptList.get(candidate).intent);
-                               System.out.println("parent 
intent:"+conceptList.get(parent).intent);
-                               
-                               if 
(conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) 
{
-                                       addParents = false;
-                                       break;
-                               }
-                               else {
-                                       if 
(conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) 
{
-                                               newParents.remove(parent);
-                                       }
-                               }
-                       }*/
-                       if (addParents) {
-                               newParents.add(candidate);
-                       }
-               }
-               System.out.println("size of lattice: " + conceptList.size());
-               PhraseConcept newConcept = new PhraseConcept();
-               newConcept.setIntent(intent);
-               newConcept.setPosition(conceptList.size());
-               conceptList.add(newConcept);
-               conceptList.get(generator).parents.add(newConcept.position);
-               for (int newParent: newParents) {
-                       if 
(conceptList.get(generator).parents.contains(newParent)) {
-                               
conceptList.get(generator).parents.remove(newParent);
-                       }
-                       
conceptList.get(newConcept.position).parents.add(newParent);
-               }
-               return newConcept.position;
-       }
-       public void printLatticeStats() {
-               System.out.println("Lattice stats");
-               System.out.println("max_object_index = " + objectCount);
-               System.out.println("max_attribute_index = " + attributeCount);
-               System.out.println("Current concept count = " + 
conceptList.size());
-       }
-       public void printLattice() {
-               for (int i = 0; i < conceptList.size(); ++i) {
-                       printConceptByPosition(i);
-               }
-       }
-       public void printConceptByPosition(int index) {
-               System.out.println("Concept at position " + index);
-               conceptList.get(index).printConcept();
-       }
-       public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
-                       List<List<ParseTreeNode>> phrs) {
-               List<List<ParseTreeChunk>> results = new 
ArrayList<List<ParseTreeChunk>>();
-               List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps 
= new ArrayList<ParseTreeChunk>(), 
-                               pps = new ArrayList<ParseTreeChunk>();
-               for(List<ParseTreeNode> ps:phrs){
-                       ParseTreeChunk ch = convertNodeListIntoChunk(ps);
-                       String ptype = ps.get(0).getPhraseType();
-                       if (ptype.equals("NP")){
-                               nps.add(ch);
-                       } else if (ptype.equals("VP")){
-                               vps.add(ch);
-                       } else if (ptype.equals("PP")){
-                               pps.add(ch);
-                       }
-               }
-               results.add(nps); results.add(vps); results.add(pps);
-               return results;
-       }
-       private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) 
{
-               List<String> lemmas = new ArrayList<String>(),  poss = new 
ArrayList<String>();
-               for(ParseTreeNode n: ps){
-                       lemmas.add(n.getWord());
-                       poss.add(n.getPos());
-               }
-               ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);
-               ch.setMainPOS(ps.get(0).getPhraseType());
-               return ch;
-       }
-       
-}
\ No newline at end of file
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.parse_thicket.pattern_structure;
+
+
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
+
+import opennlp.tools.parse_thicket.ParseTreeNode;
+import opennlp.tools.textsimilarity.ParseTreeChunk;
+import opennlp.tools.textsimilarity.ParseTreeMatcherDeterministic;
+
+
+/**
+ * Incrementally built lattice of {@link PhraseConcept}s whose intents are
+ * grouped phrase chunks ({@code List<List<ParseTreeChunk>>}).
+ *
+ * Concepts are stored in {@link #conceptList} and refer to each other by their
+ * index in that list; index 0 always holds the bottom concept created by the
+ * constructor.
+ *
+ * NOTE(review): {@link #AddIntent} appears to follow the AddIntent
+ * lattice-construction algorithm, with chunk matching standing in for set
+ * intersection; confirm against the intended design.
+ */
+public class PhrasePatternStructure {
+       int objectCount;
+       int attributeCount;
+       public List<PhraseConcept> conceptList;
+       ParseTreeMatcherDeterministic md;
+
+       /**
+        * Creates a lattice that contains only the bottom concept at position 0.
+        * The bottom concept's intent is not set here.
+        *
+        * @param objectCounts    value reported as "max_object_index" by
+        *                        {@link #printLatticeStats()}
+        * @param attributeCounts value reported as "max_attribute_index" by
+        *                        {@link #printLatticeStats()}
+        */
+       public PhrasePatternStructure(int objectCounts, int attributeCounts) {
+               objectCount = objectCounts;
+               attributeCount = attributeCounts;
+               conceptList = new ArrayList<PhraseConcept>();
+               PhraseConcept bottom = new PhraseConcept();
+               md = new ParseTreeMatcherDeterministic();
+               bottom.setPosition(0);
+               conceptList.add(bottom);
+       }
+
+       /**
+        * Climbs from {@code Generator} through parent links for as long as some
+        * parent's intent still {@code containsAll} of {@code intent}.
+        *
+        * @param intent    the intent being located
+        * @param Generator index of the concept to start climbing from
+        * @return index of the last concept reached whose parents no longer
+        *         contain {@code intent}
+        */
+       public int GetMaximalConcept(List<List<ParseTreeChunk>> intent, int Generator) {
+               boolean parentIsMaximal = true;
+               while (parentIsMaximal) {
+                       parentIsMaximal = false;
+                       for (int parent : conceptList.get(Generator).parents) {
+                               if (conceptList.get(parent).intent.containsAll(intent)) {
+                                       Generator = parent;
+                                       parentIsMaximal = true;
+                                       break;
+                               }
+                       }
+               }
+               return Generator;
+       }
+
+       /**
+        * Inserts a concept with the given intent into the lattice, recursively
+        * inserting the matcher-computed intersections with the generator's
+        * parents where needed, and rewires the parent links around the new concept.
+        * Progress is traced to standard output.
+        *
+        * @param intent    intent of the concept to add
+        * @param generator index of a concept to start the search from
+        * @return index of the concept carrying {@code intent}: the existing one
+        *         if the maximal generator already has an equal intent, otherwise
+        *         the newly appended concept
+        */
+       public int AddIntent(List<List<ParseTreeChunk>> intent, int generator) {
+               System.out.println("debug");
+               System.out.println("called for " + intent);
+               generator = GetMaximalConcept(intent, generator);
+               PhraseConcept generatorConcept = conceptList.get(generator);
+               if (generatorConcept.intent.equals(intent)) {
+                       System.out.println("at generator:" + generatorConcept.intent);
+                       System.out.println("to add:" + intent);
+                       System.out.println("already generated");
+                       return generator;
+               }
+               Set<Integer> generatorParents = generatorConcept.parents;
+               Set<Integer> newParents = new HashSet<Integer>();
+               for (int candidate : generatorParents) {
+                       if (!intent.containsAll(conceptList.get(candidate).intent)) {
+                               // The candidate is not covered by the new intent: replace it with
+                               // the concept for what the matcher finds common to both.
+                               List<List<ParseTreeChunk>> intersection = md
+                                               .matchTwoSentencesGroupedChunksDeterministic(intent,
+                                                               conceptList.get(candidate).intent);
+                               System.out.println("recursive call (inclusion)");
+                               candidate = AddIntent(intersection, candidate);
+                       }
+                       boolean addParents = true;
+                       System.out.println("now iterating over parents");
+                       // Explicit iterator so entries can be removed while scanning.
+                       Iterator<Integer> iterator = newParents.iterator();
+                       while (iterator.hasNext()) {
+                               Integer parent = iterator.next();
+                               if (conceptList.get(parent).intent.containsAll(conceptList.get(candidate).intent)) {
+                                       addParents = false;
+                                       break;
+                               }
+                               else {
+                                       if (conceptList.get(candidate).intent.containsAll(conceptList.get(parent).intent)) {
+                                               iterator.remove();
+                                       }
+                               }
+                       }
+                       if (addParents) {
+                               newParents.add(candidate);
+                       }
+               }
+               System.out.println("size of lattice: " + conceptList.size());
+               PhraseConcept newConcept = new PhraseConcept();
+               newConcept.setIntent(intent);
+               newConcept.setPosition(conceptList.size());
+               conceptList.add(newConcept);
+               generatorConcept.parents.add(newConcept.position);
+               for (int newParent : newParents) {
+                       // remove() is a no-op when the link is absent, so no contains() check is needed.
+                       generatorConcept.parents.remove(Integer.valueOf(newParent));
+                       conceptList.get(newConcept.position).parents.add(newParent);
+               }
+               return newConcept.position;
+       }
+
+       /** Prints the two constructor counts and the current number of concepts. */
+       public void printLatticeStats() {
+               System.out.println("Lattice stats");
+               System.out.println("max_object_index = " + objectCount);
+               System.out.println("max_attribute_index = " + attributeCount);
+               System.out.println("Current concept count = " + conceptList.size());
+
+       }
+
+       /** Prints every concept in {@link #conceptList} in index order. */
+       public void printLattice() {
+               for (int i = 0; i < conceptList.size(); ++i) {
+                       printConceptByPosition(i);
+               }
+       }
+
+       /**
+        * Prints the concept stored at the given index.
+        *
+        * @param index position in {@link #conceptList}
+        */
+       public void printConceptByPosition(int index) {
+               System.out.println("Concept at position " + index);
+               conceptList.get(index).printConcept();
+       }
+
+       /**
+        * Converts each phrase (a list of parse tree nodes) into a chunk and sorts
+        * the chunks into groups by the phrase type of the phrase's first node.
+        * Each processed phrase is also printed to standard output.
+        *
+        * @param phrs phrases to group; {@code null} or empty phrases are skipped
+        * @return always three lists, in the order NP, VP, PP; phrases of any other
+        *         (or missing) type are dropped
+        */
+       public List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(
+                       List<List<ParseTreeNode>> phrs) {
+               List<List<ParseTreeChunk>> results = new ArrayList<List<ParseTreeChunk>>();
+               List<ParseTreeChunk> nps = new ArrayList<ParseTreeChunk>(), vps = new ArrayList<ParseTreeChunk>(),
+                               pps = new ArrayList<ParseTreeChunk>();
+               for (List<ParseTreeNode> ps : phrs) {
+                       // A phrase without nodes has no head to read the type from;
+                       // ps.get(0) would throw IndexOutOfBoundsException.
+                       if (ps == null || ps.isEmpty()) {
+                               continue;
+                       }
+                       ParseTreeChunk ch = convertNodeListIntoChunk(ps);
+                       String ptype = ps.get(0).getPhraseType();
+                       System.out.println(ps);
+                       // Constant-first equals: a node without a phrase type is skipped, not an NPE.
+                       if ("NP".equals(ptype)) {
+                               nps.add(ch);
+                       } else if ("VP".equals(ptype)) {
+                               vps.add(ch);
+                       } else if ("PP".equals(ptype)) {
+                               pps.add(ch);
+                       }
+               }
+               results.add(nps); results.add(vps); results.add(pps);
+               return results;
+       }
+
+       /**
+        * Builds a chunk from the words and POS tags of the given nodes and uses
+        * the first node's phrase type as the chunk's main POS.
+        *
+        * @param ps nodes of one phrase; must contain at least one node
+        * @return the chunk built from {@code ps}
+        */
+       private ParseTreeChunk convertNodeListIntoChunk(List<ParseTreeNode> ps) {
+               List<String> lemmas = new ArrayList<String>(),  poss = new ArrayList<String>();
+               for (ParseTreeNode n : ps) {
+                       lemmas.add(n.getWord());
+                       poss.add(n.getPos());
+               }
+               // NOTE(review): the meaning of the two trailing 0 arguments is not visible here.
+               ParseTreeChunk ch = new ParseTreeChunk(lemmas, poss, 0, 0);
+               ch.setMainPOS(ps.get(0).getPhraseType());
+               return ch;
+       }
+
+
+}
+


http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
index 3a36e80..96bec44 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureArcsBuilder.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.parse_thicket.rhetoric_structure;
 
 import java.util.ArrayList;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
index 060d32f..3b1c576 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/rhetoric_structure/RhetoricStructureMarker.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.parse_thicket.rhetoric_structure;
 
 import java.util.ArrayList;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
index c9b1f76..cd0e541 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingQueryRunner.java
@@ -21,6 +21,8 @@ import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Logger;
 
+import org.apache.commons.lang.StringUtils;
+
 import net.billylieurance.azuresearch.AzureSearchImageQuery;
 import net.billylieurance.azuresearch.AzureSearchImageResult;
 import net.billylieurance.azuresearch.AzureSearchResultSet;
@@ -29,7 +31,11 @@ import net.billylieurance.azuresearch.AzureSearchWebResult;
 
 public class BingQueryRunner {
        
-       protected static String BING_KEY = 
"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+       // NOTE(review): this is a literal API credential committed to source
+       // control. Consider loading it from configuration or the environment
+       // and rotating any key that has been published here.
+       protected static String BING_KEY = 
+                       "WFoNMM706MMJ5JYfcHaSEDP+faHj3xAxt28CPljUAHA";
+                       //"pjtCgujmf9TtfjCVBdcQ2rBUQwGLmtLtgCG4Ex7kekw";        
        
+                       //"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+                       //"Cec1TlE67kPGDA/1MbeqPfHzP0I1eJypf3o0pYxRsuU=";
        private static final Logger LOG = Logger
                      
.getLogger("opennlp.tools.similarity.apps.BingQueryRunner");
        protected AzureSearchWebQuery aq = new AzureSearchWebQuery();
@@ -39,11 +45,32 @@ public class BingQueryRunner {
                BING_KEY = key;
        }
        
+       private int MAX_QUERY_LENGTH = 100;
+       
        public void setLang(String language){
                aq.setMarket(language);
        }
   
+       /**
+        * Runs the same query once for every page index from 0 up to (but not
+        * including) {@code nPages}, asking for 50 results each time, and returns
+        * all hits concatenated in page order.
+        *
+        * NOTE(review): whether {@code aq.setPage} expects 0-based or 1-based page
+        * numbers is not visible here; confirm against the azuresearch client.
+        *
+        * @param query  search query, passed unchanged to {@code runSearch}
+        * @param nPages number of pages to request
+        * @return hits from all requested pages; empty when {@code nPages <= 0}
+        */
+       public List<HitBase> runSearchMultiplePages(String query, int nPages) {
+               final List<HitBase> collected = new ArrayList<HitBase>();
+               int page = 0;
+               while (page < nPages) {
+                       aq.setPage(page);
+                       List<HitBase> pageHits = runSearch(query, 50);
+                       collected.addAll(pageHits);
+                       page++;
+               }
+               return collected;
+       }
+       
        public List<HitBase> runSearch(String query, int nRes) {
+               
+               if (query.length()>MAX_QUERY_LENGTH){
+                       try {
+                               query = query.substring(0, MAX_QUERY_LENGTH);
+                               //should not cut words, need the last space to 
end the query
+                               query = query.substring(0, 
StringUtils.lastIndexOf(query, " "));
+                       } catch (Exception e) {
+                               LOG.severe("Problem reducing the length of 
query :"+query);
+                       }
+               }
                aq.setAppid(BING_KEY);
                aq.setQuery(query);             
                aq.setPerPage(nRes);
@@ -54,8 +81,12 @@ public class BingQueryRunner {
                        try {
                                aq.doQuery();
                        } catch (Exception e1) {
-                               // TODO Auto-generated catch block
-                               e1.printStackTrace();
+                               
aq.setAppid("Cec1TlE67kPGDA/1MbeqPfHzP0I1eJypf3o0pYxRsuU=");
+                               try {
+                                       aq.doQuery();
+                               } catch (Exception e2) {
+                                       e2.printStackTrace();
+                               }
                        }
                        e.printStackTrace();
                }
@@ -114,138 +145,12 @@ public class BingQueryRunner {
 
   }
 
-  /*
  
 
-  private String constructBingUrl(String query, String domainWeb, String lang,
-      int numbOfHits) throws Exception {
-    String codedQuery = URLEncoder.encode(query, "UTF-8");
-    String yahooRequest = "http://api.search.live.net/json.aspx?Appid=";
-        + APP_ID + "&query=" + codedQuery // +
-        // "&sources=web"+
-        + "&Sources=News"
-        // Common request fields (optional)
-        + "&Version=2.0" + "&Market=en-us"
-        // + "&Options=EnableHighlighting"
-
-        // News-specific request fields (optional)
-        + "&News.Offset=0";
-
-    return yahooRequest;
-  }
-
- 
-    
-  public ArrayList<String> search(String query, String domainWeb, String lang,
-      int numbOfHits) throws Exception {
-    URL url = new URL(constructBingUrl(query, domainWeb, lang, numbOfHits));
-    URLConnection connection = url.openConnection();
-
-    String line;
-    ArrayList<String> result = new ArrayList<String>();
-    BufferedReader reader = new BufferedReader(new InputStreamReader(
-        connection.getInputStream()));
-    int count = 0;
-    while ((line = reader.readLine()) != null) {
-      result.add(line);
-      count++;
-    }
-    return result;
-  }
-
-  public BingResponse populateBingHit(String response) throws Exception {
-    BingResponse resp = new BingResponse();
-    JSONObject rootObject = new JSONObject(response);
-    JSONObject responseObject = rootObject.getJSONObject("SearchResponse");
-    JSONObject web = responseObject.getJSONObject("News");
-
-    // the search result is in an array under the name of "results"
-    JSONArray resultSet = null;
-    try {
-      resultSet = web.getJSONArray("Results");
-    } catch (Exception e) {
-      System.err.print("\n!!!!!!!");
-      LOG.severe("\nNo search results");
-
-    }
-    if (resultSet != null) {
-      for (int i = 0; i < resultSet.length(); i++) {
-        HitBase hit = new HitBase();
-        JSONObject singleResult = resultSet.getJSONObject(i);
-        hit.setAbstractText(singleResult.getString("Snippet"));
-        hit.setDate(singleResult.getString("Date"));
-        String title = StringUtils.replace(singleResult.getString("Title"),
-            "î", " ");
-        hit.setTitle(title);
-        hit.setUrl(singleResult.getString("Url"));
-        hit.setSource(singleResult.getString("Source"));
-
-        resp.appendHits(hit);
-      }
-    }
-    return resp;
-  }
-
-  public List<HitBase> runSearch(String query) {
-    BingResponse resp = null;
-    try {
-      List<String> resultList = search(query, "", "", 8);
-      resp = populateBingHit(resultList.get(0));
-
-    } catch (Exception e) {
-      // e.printStackTrace();
-      LOG.severe("No news search results for query " + query);
-      return null;
-    }
-    // cast to super class
-    List<HitBase> hits = new ArrayList<HitBase>();
-    for (HitBase h : resp.getHits())
-      hits.add((HitBase) h);
-
-    hits = HitBase.removeDuplicates(hits);
-    return hits;
-  }
-  */
-
-  // TODO comment back when dependencies resolved (CopyrightViolations)
-  /*
-   * public List<CopyrightViolations> runCopyRightViolExtenralSearch(String
-   * query, String report) {
-   * 
-   * List<CopyrightViolations> genResult = new 
ArrayList<CopyrightViolations>();
-   * BingResponse newResp = null; StringDistanceMeasurer meas = new
-   * StringDistanceMeasurer(); try { List<String> resultList = search(query, 
"",
-   * "", 5);
-   * 
-   * BingResponse resp = populateBingHit(resultList.get(0));
-   * //printSearchResult(resultList.get(0));
-   * 
-   * for(int i=0; i<resp.getHits().size(); i++){ BingHit h1 =
-   * resp.getHits().get(i); String snippet = h1.getAbstractText(); Double sim =
-   * meas.measureStringDistance(report, snippet); if
-   * (sim>snapshotSimilarityThreshold){ //genResult.add(snapshot);
-   * CopyrightViolations cvr = new CopyrightViolations();
-   * cvr.setSnippet(snippet); cvr.setTitle(h1.getTitle());
-   * cvr.setUrl(h1.getDisplayUrl()); genResult.add(cvr); log.debug(new
-   * String("Copyright violation detected in snapshot"
-   * ).toUpperCase()+" : sim = "+ new Double(sim).toString().substring(0, 3)+
-   * " \n "+snippet);
-   * 
-   * } else { log.debug("Different news: sim = "+ new
-   * Double(sim).toString().substring(0, 3)+ " \n "+snippet);
-   * 
-   * }
-   * 
-   * }
-   * 
-   * } catch (Exception e) { e.printStackTrace(); }
-   * 
-   * 
-   * return genResult; }
-   */
-
   public static void main(String[] args) {
     BingQueryRunner self = new BingQueryRunner();
+    List<HitBase> resp1 = self.runSearch("albert einstein", 15);
+    System.out.println(resp1);
     
     AzureSearchResultSet<AzureSearchImageResult> res = 
self.runImageSearch("albert einstein");
     System.out.println(res);

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
index a934264..d28f4e3 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/BingWebQueryRunner.java
@@ -17,25 +17,18 @@
 
 package opennlp.tools.similarity.apps;
 
-import java.io.BufferedReader;
-import java.io.FileWriter;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.net.URL;
-import java.net.URLConnection;
-import java.net.URLEncoder;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.logging.Logger;
 
+
 import net.billylieurance.azuresearch.AzureSearchResultSet;
 import net.billylieurance.azuresearch.AzureSearchWebQuery;
 import net.billylieurance.azuresearch.AzureSearchWebResult;
 import opennlp.tools.similarity.apps.utils.StringDistanceMeasurer;
 
 import org.apache.commons.lang.StringUtils;
-import org.json.JSONArray;
-import org.json.JSONObject;
+
 
 
 public class BingWebQueryRunner {
@@ -111,6 +104,12 @@ public class BingWebQueryRunner {
 
     return 0;
   }
+  
+  /**
+   * Manual smoke test: runs one sample web search and prints the hits.
+   *
+   * @param args ignored
+   */
+  public static void main(String[] args) {
+    BingWebQueryRunner self = new BingWebQueryRunner();
+    List<HitBase> res = self.runSearch("albert einstein", 10);
+    // Print the hits so the run has an observable outcome instead of
+    // silently discarding the search result.
+    System.out.println(res);
+  }
 
   
 }

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
index 428cd4e..a017105 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/ContentGeneratorSupport.java
@@ -132,7 +132,7 @@ public class ContentGeneratorSupport {
                return queryArrayStr;
 
        }
-       
+
        public static String[] cleanListOfSents(String[] sents) {
                List<String> sentsClean = new ArrayList<String>();
                for (String s : sents) {
@@ -144,11 +144,9 @@ public class ContentGeneratorSupport {
        }
 
        public static String cleanSpacesInCleanedHTMLpage(String pageContent){ 
//was 4 spaces 
-                //was 3 spaces => now back to 2
+               //was 3 spaces => now back to 2
                //TODO - verify regexp!!
                pageContent = 
pageContent.trim().replaceAll("([a-z])(\\s{2,3})([A-Z])", "$1. $3")
-                               //replaceAll("[a-z]  [A-Z]", ". $0")// 
.replace("  ",
-                               // ". ")
                                .replace("..", ".").replace(". . .", " ").
                                replace(".    .",". ").trim(); // sometimes   
html breaks are converted into ' ' (two spaces), so
                // we need to put '.'
@@ -461,7 +459,22 @@ public class ContentGeneratorSupport {
                }
                return (String[]) sentsClean.toArray(new String[0]);
        }
-       
+
+       /**
+        * Picks a period-free portion of a title.
+        *
+        * The delimiters '+', '-', '=', '_', ')' and '|' are tried in that
+        * order. For the first delimiter that splits the title into more than
+        * one piece, the first non-blank piece that contains no '.' is
+        * returned as-is (surrounding whitespace is not trimmed). If every
+        * piece contains a '.', the next delimiter is tried.
+        *
+        * @param title title to shorten; may be null
+        * @return the selected portion, or the title unchanged when it is
+        *         null, empty, or no delimiter yields a suitable piece
+        */
+       public static String getPortionOfTitleWithoutDelimiters(String title){
+               if (title == null || title.isEmpty()) {
+                       return title;
+               }
+               // Each entry is a regular expression, hence the escaping.
+               String[] delimiters = new String[]{"\\+", "-", "=", "_", "\\)", "\\|"};
+               for (String delim : delimiters) {
+                       String[] split = title.split(delim);
+                       if (split.length > 1) {
+                               for (String s : split) {
+                                       // A title starting with the delimiter yields an
+                                       // empty first piece; skip it rather than hand
+                                       // back a blank topic.
+                                       if (s.trim().isEmpty()) {
+                                               continue;
+                                       }
+                                       if (s.indexOf(".") < 0) {
+                                               return s;
+                                       }
+                               }
+                       }
+               }
+
+               return title;
+       }
+
        public static void main(String[] args){
                String s = "You can grouP   parts  Of your regular expression  
In your pattern   You grouP  elements";
                //with round brackets, e.g., ()." +
@@ -472,6 +485,15 @@ public class ContentGeneratorSupport {
                sr1 = s.replaceAll("  [A-Z]", ". $1");
        }
 
+       /**
+        * Tells whether a hit list carries no fragments at all.
+        *
+        * @param hits search hits to inspect; may be null
+        * @return true when the list is null or empty, or when no hit has
+        *         a non-empty fragment list; false as soon as one hit has
+        *         at least one fragment
+        */
+       public static boolean problematicHitList(List<HitBase> hits){
+               if (hits == null || hits.isEmpty()) {
+                       return true;
+               }
+               for (HitBase hit : hits) {
+                       // Null hits and null fragment lists count as "no fragments"
+                       // instead of failing with a NullPointerException.
+                       if (hit != null && hit.getFragments() != null
+                                       && !hit.getFragments().isEmpty()) {
+                               return false;
+                       }
+               }
+               return true;
+       }
 }
 
 

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
index 3e79b7a..17421fd 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/GeneratedSentenceProcessor.java
@@ -89,7 +89,7 @@ public class GeneratedSentenceProcessor {
 
                String[] periods = StringUtils.split(sent.replace('.', '#'), 
'#');
                if ((float) periods.length / (float) spaces.length > 0.2) {
-                       System.out.println("Rejection: too many periods in sent 
='"+sent);
+                       //System.out.println("Rejection: too many periods in 
sent ='"+sent);
                        return null;
                }
                // commented [x], to avoid rejection sentences with refs[]
@@ -102,7 +102,7 @@ public class GeneratedSentenceProcessor {
                String[] pipes = StringUtils.split(sent, '|');
                if (StringUtils.split(sent, '|').length > 2
                                || StringUtils.split(sent, '>').length > 2) {
-                       System.out.println("Rejection: too many |s or >s in 
sent ='"+sent);
+                       //System.out.println("Rejection: too many |s or >s in 
sent ='"+sent);
                        return null;
                }
                String sentTry = sent.toLowerCase();
@@ -200,14 +200,14 @@ public class GeneratedSentenceProcessor {
        public static boolean isProhibitiveWordsOccurOrStartWith(String 
sentenceLowercase){
                for(String o: occurs){
                        if (sentenceLowercase.indexOf(o)>-1){
-                               System.out.println("Found prohibited occurrence 
"+ o +" \n in sentence = "+  sentenceLowercase);
+                               //System.out.println("Found prohibited 
occurrence "+ o +" \n in sentence = "+  sentenceLowercase);
                                return true;
                        }
                }
 
                for(String o: occursStartsWith){
                        if (sentenceLowercase.startsWith(o)){
-                               System.out.println("Found prohibited occurrence 
Start With  "+ o +" \n in sentence = "+  sentenceLowercase);
+                               //System.out.println("Found prohibited 
occurrence Start With  "+ o +" \n in sentence = "+  sentenceLowercase);
                                return true;
                        }
                }

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
index bfeff62..91f6fda 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/RelatedSentenceFinder.java
@@ -82,6 +82,8 @@ public class RelatedSentenceFinder {
                this.RELEVANCE_THRESHOLD=thresh;
                yrunner.setKey(key);
        }
+       
+       int generateContentAboutIter = 0;
 
        public RelatedSentenceFinder() {
                // TODO Auto-generated constructor stub
@@ -171,6 +173,20 @@ public class RelatedSentenceFinder {
                        if (stepCount>MAX_STEPS)
                                break;
                }
+                
+               // if nothing is written, then get first search result and try 
again
+               try {
+                       if (generateContentAboutIter<4 && 
ContentGeneratorSupport.problematicHitList(opinionSentencesToAdd)){
+                               List<HitBase> resultList = 
yrunner.runSearch(sentence, 10);
+                               String discoveredSimilarTopic = 
resultList.get(generateContentAboutIter).getTitle();
+                               discoveredSimilarTopic = 
ContentGeneratorSupport.getPortionOfTitleWithoutDelimiters(discoveredSimilarTopic);
+                               generateContentAboutIter++;
+                               opinionSentencesToAdd =  
generateContentAbout(discoveredSimilarTopic);
+                       }
+               } catch (Exception e) {
+                       // TODO Auto-generated catch block
+                       e.printStackTrace();
+               }
 
                opinionSentencesToAdd = 
removeDuplicatesFromResultantHits(opinionSentencesToAdd);
                return opinionSentencesToAdd;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
index b2d2194..1c50fbf 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/StoryDiscourseNavigator.java
@@ -23,8 +23,11 @@ import java.util.Collection;
 import java.util.HashSet;
 import java.util.List;
 
+import org.apache.commons.lang.StringUtils;
+
+import opennlp.tools.similarity.apps.utils.PageFetcher;
 import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.PStemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
 import opennlp.tools.textsimilarity.TextProcessor;
@@ -34,7 +37,8 @@ public class StoryDiscourseNavigator {
        protected BingQueryRunner yrunner = new BingQueryRunner();
        ParserChunker2MatcherProcessor sm = ParserChunker2MatcherProcessor
                        .getInstance();
-       private PorterStemmer ps = new PorterStemmer();
+       private PStemmer ps = new PStemmer();
+       PageFetcher pFetcher = new PageFetcher();
 
        public static final String[] frequentPerformingVerbs = {
                " born raised meet learn ", " graduated enter discover",
@@ -53,8 +57,34 @@ public class StoryDiscourseNavigator {
                "meet enjoy follow create", "discover continue produce"
 
        };
+       
+       /**
+        * Derives keyword phrases for an entity from its Wikipedia page.
+        *
+        * Runs a web search for the entity, takes the first hit whose URL
+        * contains "wikipedia.", fetches that page's HTML and extracts the
+        * text between each {@code "><a href="#} and the following
+        * {@code "><span} (presumably the table-of-contents anchors; confirm
+        * against current Wikipedia markup). Portions containing "cite_note"
+        * are skipped. Each remaining portion has '_' and '.' replaced by
+        * spaces and is prefixed with the entity.
+        *
+        * @param entity entity name to search for
+        * @return the keyword phrases; an empty array when the search gives
+        *         no Wikipedia hit, the page cannot be fetched, or no
+        *         portion is found
+        */
+       private String[] obtainKeywordsForAnEntityFromWikipedia(String entity){
+               // NOTE(review): the API key is hard-coded in source; it should be
+               // supplied through configuration instead.
+               yrunner.setKey("xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc");
+               List<HitBase> resultList = yrunner.runSearch(entity, 20);
+               if (resultList == null) {
+                       return new String[0];
+               }
+
+               // Only accept a hit that really points at Wikipedia; previously the
+               // last hit was used when none matched, and an empty list crashed.
+               String wikiUrl = null;
+               for (HitBase hit : resultList) {
+                       String url = (hit == null) ? null : hit.getUrl();
+                       if (url != null && url.indexOf("wikipedia.") > -1) {
+                               wikiUrl = url;
+                               break;
+                       }
+               }
+               if (wikiUrl == null) {
+                       return new String[0];
+               }
+
+               String content = pFetcher.fetchOrigHTML(wikiUrl);
+               if (content == null) {
+                       return new String[0];
+               }
+               // Swap the anchor prefix for a marker that is easy to search for.
+               content = content.replace("\"><a href=\"#", "&_&_&_&");
+               String[] portions = StringUtils.substringsBetween(content,
+                               "&_&_&_&", "\"><span");
+               // substringsBetween returns null when nothing matches.
+               if (portions == null) {
+                       return new String[0];
+               }
+
+               List<String> results = new ArrayList<String>();
+               for (String portion : portions) {
+                       if (portion.indexOf("cite_note") > -1) {
+                               continue;
+                       }
+                       results.add(entity + " "
+                                       + portion.replace('_', ' ').replace('.', ' '));
+               }
+               return results.toArray(new String[0]);
+       }
 
        public String[] obtainAdditionalKeywordsForAnEntity(String entity){
+               String[] keywordsFromWikipedia = 
obtainKeywordsForAnEntityFromWikipedia(entity);
+               // these keywords should include *entity*
+               if (keywordsFromWikipedia!=null && 
keywordsFromWikipedia.length>3)
+                       return keywordsFromWikipedia;
+               
                List<List<ParseTreeChunk>> matchList = runSearchForTaxonomyPath(
                                entity, "", "en", 30);
                Collection<String> keywordsToRemove = 
TextProcessor.fastTokenize(entity.toLowerCase(), false);
@@ -70,7 +100,7 @@ public class StoryDiscourseNavigator {
                return res;
        }
 
-       public List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String query,
+       private List<List<ParseTreeChunk>> runSearchForTaxonomyPath(String 
query,
                        String domain, String lang, int numbOfHits) {
                List<List<ParseTreeChunk>> genResult = new 
ArrayList<List<ParseTreeChunk>>();
                try {
@@ -127,5 +157,7 @@ public class StoryDiscourseNavigator {
        public static void main(String[] args){
                String[] res = new 
StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("Albert 
Einstein");
                System.out.println(Arrays.asList(res));
+               res = new 
StoryDiscourseNavigator().obtainAdditionalKeywordsForAnEntity("search engine 
marketing");
+               System.out.println(Arrays.asList(res));
        }
 }

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
index 0e8d743..41afe36 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/ContentGeneratorRequestHandler.java
@@ -1,7 +1,24 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.solr;
 
 import java.io.BufferedReader;
 import java.io.File;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -61,6 +78,7 @@ public class ContentGeneratorRequestHandler extends 
SearchHandler {
        private static Logger LOG = Logger
                        
.getLogger("com.become.search.requestHandlers.SearchResultsReRankerRequestHandler");
        private ParserChunker2MatcherProcessor sm = null;
+       WordDocBuilderEndNotes docBuilder = new WordDocBuilderEndNotes ();
 
 
        public void handleRequestBody(SolrQueryRequest req, SolrQueryResponse 
rsp){
@@ -142,10 +160,9 @@ public class ContentGeneratorRequestHandler extends 
SearchHandler {
        }
 
        public String cgRunner(String[] args) {
-               ParserChunker2MatcherProcessor sm = null;
                int count=0; 
                for(String a: args){
-                       System.out.print(count+" >> " + a);
+                       System.out.print(count+">>" + a + " | ");
                        count++;
                }
                
@@ -164,13 +181,13 @@ public class ContentGeneratorRequestHandler extends 
SearchHandler {
 
                String bingKey = args[7];
                if (bingKey == null){
-                       bingKey = 
//"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
-                                       
"xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc";
+                       bingKey = 
"e8ADxIjn9YyHx36EihdjH/tMqJJItUrrbPTUpKahiU0=";
+                                       
//"xdnRVcVf9m4vDvW1SkTAz5kS5DFYa19CrPYGelGJxnc";
                }
 
                RelatedSentenceFinder f = null;
                String lang = args[6];
-               if (lang.startsWith("es")){
+               if (lang.startsWith("es") || lang.startsWith("ru") || 
lang.startsWith("de")){
                        f = new 
RelatedSentenceFinderML(Integer.parseInt(args[3]), Integer.parseInt(args[4]), 
Float.parseFloat(args[5]), bingKey);
                        f.setLang(lang);
                } else      
@@ -184,14 +201,28 @@ public class ContentGeneratorRequestHandler extends 
SearchHandler {
                try {
 
                        hits = f.generateContentAbout(args[0].replace('+', ' 
').replace('"', ' ').trim());
+                       
                        System.out.println(HitBase.toString(hits));
-                       generatedContent = HitBase.toResultantString(hits);
+                       generatedContent = HitBase.toResultantString(hits) + 
"\n REFERENCES \n" + HitBase.produceReferenceSection(hits) ;
 
+                       try {
+                               writeResultInAFile(args[0].replace('+', ' '), 
generatedContent);
+                       } catch (Exception e2) {
+                               e2.printStackTrace();
+                       }
+                       
+                       String attachmentFileName = null;
+                       try {
+                               attachmentFileName = 
docBuilder.buildWordDoc(hits, args[0].replace('+', ' ').replace('"', ' '));
+                       } catch (Exception e2) {
+                               e2.printStackTrace();
+                       }
+                       
                        opennlp.tools.apps.utils.email.EmailSender s = new 
opennlp.tools.apps.utils.email.EmailSender();
 
                        try {
                                s.sendMail("smtp.rambler.ru", 
"[email protected]", "pill0693", new InternetAddress("[email protected]"), new 
InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new 
InternetAddress[]{}, 
-                                               "Generated content for you on 
'"+args[0].replace('+', ' ')+"'", generatedContent, null);
+                                               "Generated content for you on 
'"+args[0].replace('+', ' ')+"'", generatedContent, attachmentFileName);
                        } catch (AddressException e) {
                                // TODO Auto-generated catch block
                                e.printStackTrace();
@@ -200,7 +231,7 @@ public class ContentGeneratorRequestHandler extends 
SearchHandler {
                                e.printStackTrace();
                                try {
                                        s.sendMail("smtp.rambler.ru", 
"[email protected]", "pill0693", new InternetAddress("[email protected]"), new 
InternetAddress[]{new InternetAddress(args[1])}, new InternetAddress[]{}, new 
InternetAddress[]{}, 
-                                                       "Generated content for 
you on '"+args[0].replace('+', ' ')+"'", generatedContent, null);
+                                                       "Generated content for 
you on '"+args[0].replace('+', ' ')+"'", generatedContent, attachmentFileName);
                                } catch (Exception e1) {
                                        // TODO Auto-generated catch block
                                        e1.printStackTrace();
@@ -214,6 +245,40 @@ public class ContentGeneratorRequestHandler extends 
SearchHandler {
                return generatedContent;
        }
 
+       /**
+        * Writes generated content to {@code written/<title>.txt} under the
+        * current working directory, replacing any existing file.
+        *
+        * The file name is the title with spaces turned into underscores and
+        * double quotes turned into spaces, trimmed. The directory is created
+        * when missing and the content is encoded as UTF-8. I/O failures are
+        * printed to standard error and otherwise ignored (best effort).
+        *
+        * @param title   title used to build the file name
+        * @param content text to write
+        */
+       private void writeResultInAFile(String title, String content){
+               String absPath = new File(".").getAbsolutePath();
+               // The absolute path of "." ends with the "." itself; drop it.
+               absPath = absPath.substring(0, absPath.length()-1);
+
+               // NOTE(review): the title is not checked for path separators;
+               // confirm callers never pass untrusted text containing them.
+               File file = new File(absPath+"/written/"
+                               + title.replace(' ','_').replace('\"', ' ').trim()+ ".txt");
+
+               FileOutputStream fop = null;
+               try {
+                       // FileOutputStream creates the file but not its directory.
+                       File dir = file.getParentFile();
+                       if (dir != null && !dir.mkdirs() && !dir.isDirectory()) {
+                               throw new IOException("Cannot create directory " + dir);
+                       }
+                       fop = new FileOutputStream(file);
+
+                       // Fixed charset so the output does not depend on the
+                       // platform default.
+                       fop.write(content.getBytes("UTF-8"));
+                       fop.flush();
+               } catch (IOException e) {
+                       e.printStackTrace();
+               } finally {
+                       // Single close point for both the success and failure paths.
+                       try {
+                               if (fop != null) {
+                                       fop.close();
+                               }
+                       } catch (IOException e) {
+                               e.printStackTrace();
+                       }
+               }
+       }
        
 }
 

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
index 14dc9ff..6693bbf 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeQueryComponent.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.solr;
 
 import java.io.IOException;
@@ -139,13 +155,13 @@ public class IterativeQueryComponent extends 
QueryComponent{
                        e.printStackTrace();
                }
                rb.setQparser(parser);
-               try {
+       /*      try {
                        rb.setScoreDoc(parser.getPaging());
                } catch (Exception e) {
                        // TODO Auto-generated catch block
                        e.printStackTrace();
                }
-
+*/
                String[] fqs = rb.req.getParams().getParams(CommonParams.FQ);
                if (fqs!=null && fqs.length!=0) {
                        List<Query> filters = rb.getFilters();

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
index 87f5ed9..be125b7 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/IterativeSearchRequestHandler.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.solr;
 
 import java.io.IOException;

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
index 0876700..413dd5d 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/NLProgram2CodeRequestHandler.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.solr;
 
 import java.io.IOException;
@@ -58,8 +74,8 @@ public class NLProgram2CodeRequestHandler extends 
SearchHandler {
        private ParseTreeChunkListScorer parseTreeChunkListScorer = new 
ParseTreeChunkListScorer();
        private ParserChunker2MatcherProcessor sm = null;
        private int MAX_QUERY_LENGTH_NOT_TO_RERANK=3;
-       private static String resourceDir = 
"/home/solr/solr-4.4.0/example/src/test/resources";
-       //"C:/workspace/TestSolr/src/test/resources";
+       private static String resourceDir = 
//"/home/solr/solr-4.4.0/example/src/test/resources";
+       "C:/workspace/TestSolr/src/test/resources";
 
        //"/data1/solr/example/src/test/resources";
        

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
index fbef398..b259528 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SearchResultsReRankerRequestHandler.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.solr;
 
 import java.io.IOException;
@@ -164,7 +180,7 @@ public class SearchResultsReRankerRequestHandler extends 
SearchHandler {
                NamedList<Object> values = rsp.getValues();
                values.remove("response");
                values.add("response", scoreNum); 
-               values.add("new_order", bufNums.toString().trim());
+               //values.add("new_order", bufNums.toString().trim());
                rsp.setAllValues(values);
                
        }
@@ -187,9 +203,7 @@ public class SearchResultsReRankerRequestHandler extends 
SearchHandler {
        private List<HitBase> calculateMatchScoreResortHits(List<HitBase> hits,
                        String searchQuery) {
                try {
-                       System.out.println("loading openNLP models...from 
"+resourceDir);
                        sm =  
ParserChunker2MatcherProcessor.getInstance(resourceDir);
-                       System.out.println("DONE loading openNLP model s.");
                } catch (Exception e){
                        LOG.severe(e.getMessage());
                }

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
index b2d6295..d2f4b1b 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/solr/SyntGenRequestHandler.java
@@ -1,3 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
 package opennlp.tools.similarity.apps.solr;
 
 import java.io.IOException;
@@ -56,7 +72,6 @@ import org.apache.solr.search.DocList;
 import org.apache.solr.search.DocSlice;
 import org.apache.solr.search.QParser;
 import org.apache.solr.search.SolrIndexSearcher;
-
 import org.apache.solr.util.RTimer;
 import org.apache.solr.util.SolrPluginUtils;
 

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
index 84440bd..59f2146 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomyExtenderViaMebMining.java
@@ -26,7 +26,7 @@ import java.util.logging.Logger;
 import opennlp.tools.similarity.apps.BingQueryRunner;
 import opennlp.tools.similarity.apps.HitBase;
 import opennlp.tools.similarity.apps.utils.StringCleaner;
-import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.PStemmer;
 import opennlp.tools.textsimilarity.ParseTreeChunk;
 import opennlp.tools.textsimilarity.ParseTreeChunkListScorer;
 import opennlp.tools.textsimilarity.SentencePairMatchResult;
@@ -51,7 +51,7 @@ public class TaxonomyExtenderViaMebMining extends 
BingQueryRunner {
 
   private Map<String, List<List<String>>> lemma_ExtendedAssocWords = new 
HashMap<String, List<List<String>>>();
   private Map<List<String>, List<List<String>>> assocWords_ExtendedAssocWords 
= new HashMap<List<String>, List<List<String>>>();
-  private PorterStemmer ps;
+  private PStemmer ps;
 
   public Map<List<String>, List<List<String>>> 
getAssocWords_ExtendedAssocWords() {
     return assocWords_ExtendedAssocWords;
@@ -73,7 +73,7 @@ public class TaxonomyExtenderViaMebMining extends 
BingQueryRunner {
       System.err.println("Problem loading synt matcher");
 
     }
-    ps = new PorterStemmer();
+    ps = new PStemmer();
 
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
index 16e9fb2..a70340e 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/taxo_builder/TaxonomySerializer.java
@@ -22,10 +22,13 @@ import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.io.Serializable;
+import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 
+import opennlp.tools.jsmlearning.ProfileReaderWriter;
+
 /**
  * This class stores the taxonomy on the file-system
  * 
@@ -80,6 +83,31 @@ public class TaxonomySerializer implements Serializable {
       ex.printStackTrace();
     }
 
+     String csvFilename = filename+".csv";
+     List<String[]> taxo_list = new  ArrayList<String[]>();
+     List<String> entries = new 
ArrayList<String>(lemma_ExtendedAssocWords.keySet());
+     for(String e: entries){
+        List<String> lines = new ArrayList<String>();
+        lines.add(e);
+        for(List<String> ls: lemma_ExtendedAssocWords.get(e)){
+                lines.add(ls.toString());
+        }
+        taxo_list.add((String[])lines.toArray(new String[0]));
+     }
+     ProfileReaderWriter.writeReport(taxo_list, csvFilename);
+     
+     String csvFilenameListEntries = filename+"_ListEntries.csv";
+     taxo_list = new  ArrayList<String[]>();
+     List<List<String>> entriesList = new ArrayList<List<String>>( 
assocWords_ExtendedAssocWords.keySet());
+     for(List<String> e: entriesList){
+        List<String> lines = new ArrayList<String>();
+        lines.addAll(e);
+        for(List<String> ls: assocWords_ExtendedAssocWords.get(e)){
+                lines.add(ls.toString());
+        }
+        taxo_list.add((String[])lines.toArray(new String[0]));
+     }
+     ProfileReaderWriter.writeReport(taxo_list, csvFilenameListEntries);
   }
 
   public static TaxonomySerializer readTaxonomy(String filename) {

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
index 4c01e39..7f17f84 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/PageFetcher.java
@@ -18,6 +18,7 @@
 package opennlp.tools.similarity.apps.utils;
 
 import java.io.BufferedReader;
+import java.io.File;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.net.MalformedURLException;
@@ -27,54 +28,94 @@ import java.util.logging.Logger;
 
 import org.apache.tika.Tika;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+
 
 public class PageFetcher {
-  private static final Logger LOG = Logger
+  private static final Logger log = Logger
       .getLogger("opennlp.tools.similarity.apps.utils.PageFetcher");
+  Tika tika = new Tika();
 
-  private static int DEFAULT_TIMEOUT = 15000;
+  private static int DEFAULT_TIMEOUT = 1500;
+  private void setTimeout(int to){
+         DEFAULT_TIMEOUT = to;
+  }
 
   public String fetchPage(final String url) {
     return fetchPage(url, DEFAULT_TIMEOUT);
   }
+  
+  public String fetchPageAutoDetectParser(final String url ){
+         String fetchURL = addHttp(url);
+         String pageContent = null;
+           URLConnection connection;
+           try {
+             log.info("fetch url  auto detect parser " + url);
+             connection = new URL(fetchURL).openConnection();
+             connection.setReadTimeout(DEFAULT_TIMEOUT);
+             
+           //parse method parameters
+             Parser parser = new AutoDetectParser();
+             BodyContentHandler handler = new BodyContentHandler();
+             Metadata metadata = new Metadata();
+             ParseContext context = new ParseContext();
+             
+             //parsing the file
+             parser.parse(connection.getInputStream(), handler, metadata, 
context);
+             
+             pageContent = handler.toString();
+           } catch (Exception e) {
+             log.info(e.getMessage() + "\n" + e);
+           }
+           return  pageContent;
+  }
+  
 
   public String fetchPage(final String url, final int timeout) {
     String fetchURL = addHttp(url);
 
-    LOG.info("fetch url " + fetchURL);
+    log.info("fetch url " + fetchURL);
 
     String pageContent = null;
     URLConnection connection;
     try {
-      connection = new URL(url).openConnection();
+      connection = new URL(fetchURL).openConnection();
       connection.setReadTimeout(DEFAULT_TIMEOUT);
-      Tika tika = new Tika();
+      
       pageContent = tika.parseToString(connection.getInputStream())
           .replace('\n', ' ').replace('\t', ' ');
     } catch (MalformedURLException e) {
-      LOG.severe(e.getMessage() + "\n" + e);
+      log.severe(e.getMessage() + "\n" + e);
     } catch (IOException e) {
-      LOG.severe(e.getMessage() + "\n" + e);
+      log.severe(e.getMessage() + "\n" + e);
     } catch (TikaException e) {
-      LOG.severe(e.getMessage() + "\n" + e);
+      log.severe(e.getMessage() + "\n" + e);
     }
     return pageContent;
   }
 
   private String addHttp(final String url) {
-    if (!url.startsWith("http://")) {
+    if (!url.startsWith("http://") && !url.startsWith("https://")) {
       return "http://" + url;
     }
     return url;
   }
+  
+  public String fetchOrigHTML(String url, int timeout) {
+         setTimeout(timeout);
+         return fetchOrigHTML(url);
+  }
 
   public String fetchOrigHTML(String url) {
-    System.out.println("fetch url " + url);
-    String pageContent = null;
+    log.info("fetch url " + url);
     StringBuffer buf = new StringBuffer();
     try {
       URLConnection connection = new URL(url).openConnection();
-      connection.setReadTimeout(10000);
+      connection.setReadTimeout(DEFAULT_TIMEOUT);
       connection
           .setRequestProperty(
               "User-Agent",
@@ -85,8 +126,8 @@ public class PageFetcher {
         reader = new BufferedReader(new InputStreamReader(
             connection.getInputStream()));
       } catch (Exception e) {
-        // we dont need to log trial web pages if access fails
-        // LOG.error(e.getMessage(), e);
+        // we dont always need to log trial web pages if access fails
+        log.severe(e.toString());
       }
 
       while ((line = reader.readLine()) != null) {
@@ -107,5 +148,19 @@ public class PageFetcher {
     } */
     return buf.toString();
   }
+  
+  public static void main(String[] args){
+         PageFetcher fetcher = new PageFetcher();
+         String content = 
fetcher.fetchPageAutoDetectParser("http://www.elastica.net/");
+         System.out.println(content);
+         content = fetcher.
+                         fetchPageAutoDetectParser("http://www.cnn.com");
+         System.out.println(content);
+         content = new PageFetcher().fetchPage("https://github.com");
+         System.out.println(content);
+         content = new PageFetcher().fetchOrigHTML("http://www.cnn.com");
+         System.out.println(content);
+         
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
index c2238c5..377b02a 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/similarity/apps/utils/StringDistanceMeasurer.java
@@ -20,11 +20,11 @@ package opennlp.tools.similarity.apps.utils;
 import java.util.ArrayList;
 import java.util.List;
 
-import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.PStemmer;
 
 public class StringDistanceMeasurer {
   // external tools
-  private PorterStemmer ps; // stemmer
+  private PStemmer ps; // stemmer
 
   private static final int MIN_STRING_LENGTH_FOR_WORD = 4;
 
@@ -36,7 +36,7 @@ public class StringDistanceMeasurer {
 
   public StringDistanceMeasurer() {
     // first get stemmer
-    ps = new PorterStemmer();
+    ps = new PStemmer();
     if (MIN_SCORE_FOR_LING > 1.0)
       return;
 

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java 
b/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java
deleted file mode 100644
index e23da90..0000000
--- a/opennlp-similarity/src/main/java/opennlp/tools/stemmer/PorterStemmer.java
+++ /dev/null
@@ -1,521 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.stemmer;
-
-
-       import java.io.IOException;
-       import java.io.InputStream;
-       import java.io.FileInputStream;
-
-       import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
-       import org.apache.lucene.util.ArrayUtil;
-
-       /**
-        *
-        * Stemmer, implementing the Porter Stemming Algorithm
-        *
-        * The Stemmer class transforms a word into its root form.  The input
-        * word can be provided a character at time (by calling add()), or at 
once
-        * by calling one of the various stem(something) methods.
-        */
-
-       public class PorterStemmer
-       {
-         private char[] b;
-         private int i,    /* offset into b */
-           j, k, k0;
-         private boolean dirty = false;
-         private static final int INITIAL_SIZE = 50;
-
-         public PorterStemmer() {
-           b = new char[INITIAL_SIZE];
-           i = 0;
-         }
-
-         /**
-          * reset() resets the stemmer so it can stem another word.  If you 
invoke
-          * the stemmer by calling add(char) and then stem(), you must call 
reset()
-          * before starting another word.
-          */
-         public void reset() { i = 0; dirty = false; }
-
-         /**
-          * Add a character to the word being stemmed.  When you are finished
-          * adding characters, you can call stem(void) to process the word.
-          */
-         public void add(char ch) {
-           if (b.length <= i) {
-             b = ArrayUtil.grow(b, i+1);
-           }
-           b[i++] = ch;
-         }
-
-         /**
-          * After a word has been stemmed, it can be retrieved by toString(),
-          * or a reference to the internal buffer can be retrieved by 
getResultBuffer
-          * and getResultLength (which is generally more efficient.)
-          */
-         @Override
-         public String toString() { return new String(b,0,i); }
-
-         /**
-          * Returns the length of the word resulting from the stemming process.
-          */
-         public int getResultLength() { return i; }
-
-         /**
-          * Returns a reference to a character buffer containing the results of
-          * the stemming process.  You also need to consult getResultLength()
-          * to determine the length of the result.
-          */
-         public char[] getResultBuffer() { return b; }
-
-         /* cons(i) is true <=> b[i] is a consonant. */
-
-         private final boolean cons(int i) {
-           switch (b[i]) {
-           case 'a': case 'e': case 'i': case 'o': case 'u':
-             return false;
-           case 'y':
-             return (i==k0) ? true : !cons(i-1);
-           default:
-             return true;
-           }
-         }
-
-         /* m() measures the number of consonant sequences between k0 and j. 
if c is
-            a consonant sequence and v a vowel sequence, and <..> indicates 
arbitrary
-            presence,
-
-                 <c><v>       gives 0
-                 <c>vc<v>     gives 1
-                 <c>vcvc<v>   gives 2
-                 <c>vcvcvc<v> gives 3
-                 ....
-         */
-
-         private final int m() {
-           int n = 0;
-           int i = k0;
-           while(true) {
-             if (i > j)
-               return n;
-             if (! cons(i))
-               break;
-             i++;
-           }
-           i++;
-           while(true) {
-             while(true) {
-               if (i > j)
-                 return n;
-               if (cons(i))
-                 break;
-               i++;
-             }
-             i++;
-             n++;
-             while(true) {
-               if (i > j)
-                 return n;
-               if (! cons(i))
-                 break;
-               i++;
-             }
-             i++;
-           }
-         }
-
-         /* vowelinstem() is true <=> k0,...j contains a vowel */
-
-         private final boolean vowelinstem() {
-           int i;
-           for (i = k0; i <= j; i++)
-             if (! cons(i))
-               return true;
-           return false;
-         }
-
-         /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
-
-         private final boolean doublec(int j) {
-           if (j < k0+1)
-             return false;
-           if (b[j] != b[j-1])
-             return false;
-           return cons(j);
-         }
-
-         /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - 
consonant
-            and also if the second c is not w,x or y. this is used when trying 
to
-            restore an e at the end of a short word. e.g.
-
-                 cav(e), lov(e), hop(e), crim(e), but
-                 snow, box, tray.
-
-         */
-
-         private final boolean cvc(int i) {
-           if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
-             return false;
-           else {
-             int ch = b[i];
-             if (ch == 'w' || ch == 'x' || ch == 'y') return false;
-           }
-           return true;
-         }
-
-         private final boolean ends(String s) {
-           int l = s.length();
-           int o = k-l+1;
-           if (o < k0)
-             return false;
-           for (int i = 0; i < l; i++)
-             if (b[o+i] != s.charAt(i))
-               return false;
-           j = k-l;
-           return true;
-         }
-
-         /* setto(s) sets (j+1),...k to the characters in the string s, 
readjusting
-            k. */
-
-         void setto(String s) {
-           int l = s.length();
-           int o = j+1;
-           for (int i = 0; i < l; i++)
-             b[o+i] = s.charAt(i);
-           k = j+l;
-           dirty = true;
-         }
-
-         /* r(s) is used further down. */
-
-         void r(String s) { if (m() > 0) setto(s); }
-
-         /* step1() gets rid of plurals and -ed or -ing. e.g.
-
-                  caresses  ->  caress
-                  ponies    ->  poni
-                  ties      ->  ti
-                  caress    ->  caress
-                  cats      ->  cat
-
-                  feed      ->  feed
-                  agreed    ->  agree
-                  disabled  ->  disable
-
-                  matting   ->  mat
-                  mating    ->  mate
-                  meeting   ->  meet
-                  milling   ->  mill
-                  messing   ->  mess
-
-                  meetings  ->  meet
-
-         */
-
-         private final void step1() {
-           if (b[k] == 's') {
-             if (ends("sses")) k -= 2;
-             else if (ends("ies")) setto("i");
-             else if (b[k-1] != 's') k--;
-           }
-           if (ends("eed")) {
-             if (m() > 0)
-               k--;
-           }
-           else if ((ends("ed") || ends("ing")) && vowelinstem()) {
-             k = j;
-             if (ends("at")) setto("ate");
-             else if (ends("bl")) setto("ble");
-             else if (ends("iz")) setto("ize");
-             else if (doublec(k)) {
-               int ch = b[k--];
-               if (ch == 'l' || ch == 's' || ch == 'z')
-                 k++;
-             }
-             else if (m() == 1 && cvc(k))
-               setto("e");
-           }
-         }
-
-         /* step2() turns terminal y to i when there is another vowel in the 
stem. */
-
-         private final void step2() {
-           if (ends("y") && vowelinstem()) {
-             b[k] = 'i';
-             dirty = true;
-           }
-         }
-
-         /* step3() maps double suffices to single ones. so -ization ( = -ize 
plus
-            -ation) maps to -ize etc. note that the string before the suffix 
must give
-            m() > 0. */
-
-         private final void step3() {
-           if (k == k0) return; /* For Bug 1 */
-           switch (b[k-1]) {
-           case 'a':
-             if (ends("ational")) { r("ate"); break; }
-             if (ends("tional")) { r("tion"); break; }
-             break;
-           case 'c':
-             if (ends("enci")) { r("ence"); break; }
-             if (ends("anci")) { r("ance"); break; }
-             break;
-           case 'e':
-             if (ends("izer")) { r("ize"); break; }
-             break;
-           case 'l':
-             if (ends("bli")) { r("ble"); break; }
-             if (ends("alli")) { r("al"); break; }
-             if (ends("entli")) { r("ent"); break; }
-             if (ends("eli")) { r("e"); break; }
-             if (ends("ousli")) { r("ous"); break; }
-             break;
-           case 'o':
-             if (ends("ization")) { r("ize"); break; }
-             if (ends("ation")) { r("ate"); break; }
-             if (ends("ator")) { r("ate"); break; }
-             break;
-           case 's':
-             if (ends("alism")) { r("al"); break; }
-             if (ends("iveness")) { r("ive"); break; }
-             if (ends("fulness")) { r("ful"); break; }
-             if (ends("ousness")) { r("ous"); break; }
-             break;
-           case 't':
-             if (ends("aliti")) { r("al"); break; }
-             if (ends("iviti")) { r("ive"); break; }
-             if (ends("biliti")) { r("ble"); break; }
-             break;
-           case 'g':
-             if (ends("logi")) { r("log"); break; }
-           }
-         }
-
-         /* step4() deals with -ic-, -full, -ness etc. similar strategy to 
step3. */
-
-         private final void step4() {
-           switch (b[k]) {
-           case 'e':
-             if (ends("icate")) { r("ic"); break; }
-             if (ends("ative")) { r(""); break; }
-             if (ends("alize")) { r("al"); break; }
-             break;
-           case 'i':
-             if (ends("iciti")) { r("ic"); break; }
-             break;
-           case 'l':
-             if (ends("ical")) { r("ic"); break; }
-             if (ends("ful")) { r(""); break; }
-             break;
-           case 's':
-             if (ends("ness")) { r(""); break; }
-             break;
-           }
-         }
-
-         /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
-
-         private final void step5() {
-           if (k == k0) return; /* for Bug 1 */
-           switch (b[k-1]) {
-           case 'a':
-             if (ends("al")) break;
-             return;
-           case 'c':
-             if (ends("ance")) break;
-             if (ends("ence")) break;
-             return;
-           case 'e':
-             if (ends("er")) break; return;
-           case 'i':
-             if (ends("ic")) break; return;
-           case 'l':
-             if (ends("able")) break;
-             if (ends("ible")) break; return;
-           case 'n':
-             if (ends("ant")) break;
-             if (ends("ement")) break;
-             if (ends("ment")) break;
-             /* element etc. not stripped before the m */
-             if (ends("ent")) break;
-             return;
-           case 'o':
-             if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
-             /* j >= 0 fixes Bug 2 */
-             if (ends("ou")) break;
-             return;
-             /* takes care of -ous */
-           case 's':
-             if (ends("ism")) break;
-             return;
-           case 't':
-             if (ends("ate")) break;
-             if (ends("iti")) break;
-             return;
-           case 'u':
-             if (ends("ous")) break;
-             return;
-           case 'v':
-             if (ends("ive")) break;
-             return;
-           case 'z':
-             if (ends("ize")) break;
-             return;
-           default:
-             return;
-           }
-           if (m() > 1)
-             k = j;
-         }
-
-         /* step6() removes a final -e if m() > 1. */
-
-         private final void step6() {
-           j = k;
-           if (b[k] == 'e') {
-             int a = m();
-             if (a > 1 || a == 1 && !cvc(k-1))
-               k--;
-           }
-           if (b[k] == 'l' && doublec(k) && m() > 1)
-             k--;
-         }
-
-
-         /**
-          * Stem a word provided as a String.  Returns the result as a String.
-          */
-         public String stem(String s) {
-           if (stem(s.toCharArray(), s.length()))
-             return toString();
-           else
-             return s;
-         }
-
-         /** Stem a word contained in a char[].  Returns true if the stemming 
process
-          * resulted in a word different from the input.  You can retrieve the
-          * result with getResultLength()/getResultBuffer() or toString().
-          */
-         public boolean stem(char[] word) {
-           return stem(word, word.length);
-         }
-
-         /** Stem a word contained in a portion of a char[] array.  Returns
-          * true if the stemming process resulted in a word different from
-          * the input.  You can retrieve the result with
-          * getResultLength()/getResultBuffer() or toString().
-          */
-         public boolean stem(char[] wordBuffer, int offset, int wordLen) {
-           reset();
-           if (b.length < wordLen) {
-             b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
-           }
-           System.arraycopy(wordBuffer, offset, b, 0, wordLen);
-           i = wordLen;
-           return stem(0);
-         }
-
-         /** Stem a word contained in a leading portion of a char[] array.
-          * Returns true if the stemming process resulted in a word different
-          * from the input.  You can retrieve the result with
-          * getResultLength()/getResultBuffer() or toString().
-          */
-         public boolean stem(char[] word, int wordLen) {
-           return stem(word, 0, wordLen);
-         }
-
-         /** Stem the word placed into the Stemmer buffer through calls to 
add().
-          * Returns true if the stemming process resulted in a word different
-          * from the input.  You can retrieve the result with
-          * getResultLength()/getResultBuffer() or toString().
-          */
-         public boolean stem() {
-           return stem(0);
-         }
-
-         public boolean stem(int i0) {
-           k = i - 1;
-           k0 = i0;
-           if (k > k0+1) {
-             step1(); step2(); step3(); step4(); step5(); step6();
-           }
-           // Also, a word is considered dirty if we lopped off letters
-           // Thanks to Ifigenia Vairelles for pointing this out.
-           if (i != k+1)
-             dirty = true;
-           i = k+1;
-           return dirty;
-         }
-
-         /** Test program for demonstrating the Stemmer.  It reads a file and
-          * stems each word, writing the result to standard out.
-          * Usage: Stemmer file-name
-          */
-         public static void main(String[] args) {
-           PorterStemmer s = new PorterStemmer();
-
-           for (int i = 0; i < args.length; i++) {
-             try {
-               InputStream in = new FileInputStream(args[i]);
-               byte[] buffer = new byte[1024];
-               int bufferLen, offset, ch;
-
-               bufferLen = in.read(buffer);
-               offset = 0;
-               s.reset();
-
-               while(true) {
-                 if (offset < bufferLen)
-                   ch = buffer[offset++];
-                 else {
-                   bufferLen = in.read(buffer);
-                   offset = 0;
-                   if (bufferLen < 0)
-                     ch = -1;
-                   else
-                     ch = buffer[offset++];
-                 }
-
-                 if (Character.isLetter((char) ch)) {
-                   s.add(Character.toLowerCase((char) ch));
-                 }
-                 else {
-                    s.stem();
-                    System.out.print(s.toString());
-                    s.reset();
-                    if (ch < 0)
-                      break;
-                    else {
-                      System.out.print((char) ch);
-                    }
-                  }
-               }
-
-               in.close();
-             }
-             catch (IOException e) {
-               System.out.println("error reading " + args[i]);
-             }
-           }
-         }
-       }
-

http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
----------------------------------------------------------------------
diff --git 
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
 
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
index 1dc100c..a72583e 100644
--- 
a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
+++ 
b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/LemmaFormManager.java
@@ -19,11 +19,11 @@ package opennlp.tools.textsimilarity;
 
 import java.util.List;
 
-import opennlp.tools.stemmer.PorterStemmer;
+import opennlp.tools.stemmer.PStemmer;
 
 public class LemmaFormManager {
 
-  public String matchLemmas(PorterStemmer ps, String lemma1, String lemma2,
+  public String matchLemmas(PStemmer ps, String lemma1, String lemma2,
       String POS) {
     if (POS == null) {
       return null;

[3/5] opennlp-sandbox git commit: merge from bgalitsky's own git repo

Reply via email to