merge from bgalitsky's own git repo
Project: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/repo Commit: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/commit/9aa270c1 Tree: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/tree/9aa270c1 Diff: http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/diff/9aa270c1 Branch: refs/heads/master Commit: 9aa270c11a5974fbd10d42f1510e855cb1040035 Parents: ad4195b Author: Boris Galitsky <[email protected]> Authored: Wed Nov 16 10:04:29 2016 -0800 Committer: Boris Galitsky <[email protected]> Committed: Wed Nov 16 10:04:29 2016 -0800 ---------------------------------------------------------------------- opennlp-similarity/README | 138 --- opennlp-similarity/pom.xml | 424 +++++--- .../multithreaded/BingWebQueryRunnerThread.java | 2 +- .../tools/apps/relevanceVocabs/POStags.java | 17 + .../apps/relevanceVocabs/PhraseProcessor.java | 17 + .../apps/relevanceVocabs/SentimentVocab.java | 19 +- .../apps/relevanceVocabs/SynonymListFilter.java | 17 + .../tools/apps/relevanceVocabs/SynonymMap.java | 64 +- .../apps/relevanceVocabs/WordDictionary.java | 17 + .../tools/apps/utils/email/EmailSender.java | 7 +- .../tools/jsmlearning/ProfileReaderWriter.java | 25 + .../tools/parse_thicket/ParseCorefsBuilder.java | 24 +- .../tools/parse_thicket/ParseThicket.java | 46 + .../tools/parse_thicket/ParseTreeNode.java | 215 ++-- .../WordWordInterSentenceRelationArc.java | 2 +- .../MultiSentenceSearchResultsProcessor.java | 4 +- ...edForestSearchResultsProcessorSetFormer.java | 3 +- ...ntenceKernelBasedSearchResultsProcessor.java | 2 +- .../PT2ExtendedTreeForestBuilder.java | 35 +- .../SnippetToParagraphFull.java | 17 + .../TreeExtenderByAnotherLinkedTree.java | 111 +- .../kernel_interface/TreeKernelRunner.java | 33 +- .../matching/GeneralizationListReducer.java | 148 --- .../matching/LemmaFormManager.java | 8 +- .../tools/parse_thicket/matching/Matcher.java | 199 +++- .../matching/PT2ThicketPhraseBuilder.java | 298 +++--- .../matching/ParseTreeChunkListScorer.java | 96 -- .../parse_thicket/matching/ParseTreePath.java | 422 -------- .../matching/ParseTreePathComparable.java | 32 - .../matching/ParseTreePathMatcher.java | 254 ----- .../ParseTreePathMatcherDeterministic.java | 280 ----- .../parse_thicket2graph/EdgeProductBuilder.java | 16 + .../GraphFromPTreeBuilder.java | 16 + .../parse_thicket2graph/ParseGraphNode.java | 16 + .../ParseTreeVisualizer.java | 35 +- .../pattern_structure/PhraseConcept.java | 129 ++- .../PhrasePatternStructure.java | 358 ++++--- .../RhetoricStructureArcsBuilder.java | 16 + .../RhetoricStructureMarker.java | 16 + .../tools/similarity/apps/BingQueryRunner.java | 167 +-- .../similarity/apps/BingWebQueryRunner.java | 17 +- .../apps/ContentGeneratorSupport.java | 32 +- .../apps/GeneratedSentenceProcessor.java | 8 +- .../similarity/apps/RelatedSentenceFinder.java | 16 + .../apps/StoryDiscourseNavigator.java | 38 +- .../solr/ContentGeneratorRequestHandler.java | 81 +- .../apps/solr/IterativeQueryComponent.java | 20 +- .../solr/IterativeSearchRequestHandler.java | 16 + .../apps/solr/NLProgram2CodeRequestHandler.java | 20 +- .../SearchResultsReRankerRequestHandler.java | 20 +- .../apps/solr/SyntGenRequestHandler.java | 17 +- .../TaxonomyExtenderViaMebMining.java | 6 +- .../apps/taxo_builder/TaxonomySerializer.java | 28 + .../similarity/apps/utils/PageFetcher.java | 83 +- .../apps/utils/StringDistanceMeasurer.java | 6 +- .../opennlp/tools/stemmer/PorterStemmer.java | 521 --------- .../tools/textsimilarity/LemmaFormManager.java | 4 +- .../tools/textsimilarity/ParseTreeChunk.java | 933 +++++++++------- .../ParseTreeChunkListScorer.java | 13 +- .../ParseTreeMatcherDeterministic.java | 4 +- .../tools/textsimilarity/TextProcessor.java | 6 +- .../opennlp/tools/textsimilarity/readme.txt | 15 + .../apps/RelatedSentenceFinderTest.java | 20 +- .../matching/PT2ThicketPhraseBuilderTest.java | 17 + .../parse_thicket/matching/PTMatcherTest.java | 73 +- .../matching/PTPhraseBuilderTest.java | 17 + .../matching/PairwiseMatcherTest.java | 19 +- .../PhrasePatternStructureTest.java | 27 +- .../tools/textsimilarity/SyntMatcherTest.java | 8 +- .../ParserChunker2MatcherProcessorTest.java | 10 +- .../src/test/resources/sentence_parseObject.csv | 1000 ++++++++++++++---- 71 files changed, 3408 insertions(+), 3432 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/README ---------------------------------------------------------------------- diff --git a/opennlp-similarity/README b/opennlp-similarity/README deleted file mode 100644 index b535487..0000000 --- a/opennlp-similarity/README +++ /dev/null @@ -1,138 +0,0 @@ -Apache OpenNLP ${pom.version} -=============================== - - -Building from the Source Distribution -------------------------------------- - -At least Maven 3.0.0 is required for building. - -To build everything go into the opennlp directory and run the following command: - mvn clean install - -The results of the build will be placed in: - opennlp-distr/target/apache-opennlp-[version]-bin.tar-gz (or .zip) - -What is in Similarity component in Apache OpenNLP ${pom.version} ---------------------------------------- -SIMILARITY COMPONENT of OpenNLP - -1. Introduction -This component does text relevance assessment. It takes two portions of texts (phrases, sentences, paragraphs) and returns a similarity score. -Similarity component can be used on top of search to improve relevance, computing similarity score between a question and all search results (snippets). -Also, this component is useful for web mining of images, videos, forums, blogs, and other media with textual descriptions. Such applications as content generation -and filtering meaningless speech recognition results are included in the sample applications of this component. - Relevance assessment is based on machine learning of syntactic parse trees (constituency trees, http://en.wikipedia.org/wiki/Parse_tree). -The similarity score is calculated as the size of all maximal common sub-trees for sentences from a pair of texts ( -www.aaai.org/ocs/index.php/WS/AAAIW11/paper/download/3971/4187, www.aaai.org/ocs/index.php/FLAIRS/FLAIRS11/paper/download/2573/3018, -www.aaai.org/ocs/index.php/SSS/SSS10/paper/download/1146/1448). - The objective of Similarity component is to give an application engineer as tool for text relevance which can be used as a black box, no need to understand - computational linguistics or machine learning. - - 2. Installation - Please refer to OpenNLP installation instructions - - 3. First use case of Similarity component: search - - To start with this component, please refer to SearchResultsProcessorTest.java in package opennlp.tools.similarity.apps - public void testSearchOrder() runs web search using Bing API and improves search relevance. - Look at the code of - public List<HitBase> runSearch(String query) - and then at - private BingResponse calculateMatchScoreResortHits(BingResponse resp, String searchQuery) - which gets search results from Bing and re-ranks them based on computed similarity score. - - The main entry to Similarity component is - SentencePairMatchResult matchRes = sm.assessRelevance(snapshot, searchQuery); - where we pass the search query and the snapshot and obtain the similarity assessment structure which includes the similarity score. - - To run this test you need to obtain search API key from Bing at www.bing.com/developers/s/APIBasics.html and specify it in public class BingQueryRunner in - protected static final String APP_ID. - - 4. Solving a unique problem: content generation - To demonstrate the usability of Similarity component to tackle a problem which is hard to solve without a linguistic-based technology, - we introduce a content generation component: - RelatedSentenceFinder.java - - The entry point here is the function call - hits = f.generateContentAbout("Albert Einstein"); - which writes a biography of Albert Einstein by finding sentences on the web about various kinds of his activities (such as 'born', 'graduate', 'invented' etc.). - The key here is to compute similarity between the seed expression like "Albert Einstein invented relativity theory" and search result like - "Albert Einstein College of Medicine | Medical Education | Biomedical ... - www.einstein.yu.edu/Albert Einstein College of Medicine is one of the nation's premier institutions for medical education, ..." - and filter out irrelevant search results. - - This is done in function - public HitBase augmentWithMinedSentencesAndVerifyRelevance(HitBase item, String originalSentence, - List<String> sentsAll) - - SentencePairMatchResult matchRes = sm.assessRelevance(pageSentence + " " + title, originalSentence); - You can consult the results in gen.txt, where an essay on Einstein bio is written. - - These are examples of generated articles, given the article title - http://www.allvoices.com/contributed-news/9423860/content/81937916-ichie-sings-jazz-blues-contemporary-tunes - http://www.allvoices.com/contributed-news/9415063-britney-spears-femme-fatale-in-north-sf-bay-area - - 5. Solving a high-importance problem: filtering out meaningless speech recognition results. - Speech recognitions SDKs usually produce a number of phrases as results, such as - "remember to buy milk tomorrow from trader joes", - "remember to buy milk tomorrow from 3 to jones" - One can see that the former is meaningful, and the latter is meaningless (although similar in terms of how it is pronounced). - We use web mining and Similarity component to detect a meaningful option (a mistake caused by trying to interpret meaningless - request by a query understanding system such as Siri for iPhone can be costly). - - SpeechRecognitionResultsProcessor.java does the job: - public List<SentenceMeaningfullnessScore> runSearchAndScoreMeaningfulness(List<String> sents) - re-ranks the phrases in the order of decrease of meaningfulness. - - 6. Similarity component internals - in the package opennlp.tools.textsimilarity.chunker2matcher - ParserChunker2MatcherProcessor.java does parsing of two portions of text and matching the resultant parse trees to assess similarity between - these portions of text. - To run ParserChunker2MatcherProcessor - private static String MODEL_DIR = "resources/models"; - needs to be specified - - The key function - public SentencePairMatchResult assessRelevance(String para1, String para2) - takes two portions of text and does similarity assessment by finding the set of all maximum common subtrees - of the set of parse trees for each portion of text - - It splits paragraphs into sentences, parses them, obtained chunking information and produces grouped phrases (noun, evrn, prepositional etc.): - public synchronized List<List<ParseTreeChunk>> formGroupedPhrasesFromChunksForPara(String para) - - and then attempts to find common subtrees: - in ParseTreeMatcherDeterministic.java - List<List<ParseTreeChunk>> res = md.matchTwoSentencesGroupedChunksDeterministic(sent1GrpLst, sent2GrpLst) - - Phrase matching functionality is in package opennlp.tools.textsimilarity; - ParseTreeMatcherDeterministic.java: - Here's the key matching function which takes two phrases, aligns them and finds a set of maximum common sub-phrase - public List<ParseTreeChunk> generalizeTwoGroupedPhrasesDeterministic - - 7. Package structure - opennlp.tools.similarity.apps : 3 main applications - opennlp.tools.similarity.apps.utils: utilities for above applications - - opennlp.tools.textsimilarity.chunker2matcher: parser which converts text into a form for matching parse trees - opennlp.tools.textsimilarity: parse tree matching functionality - - - - -Requirements ------------- -Java 1.5 is required to run OpenNLP -Maven 3.0.0 is required for building it - -Known OSGi Issues ------------- -In an OSGi environment the following things are not supported: -- The coreference resolution component -- The ability to load a user provided feature generator class - -Note ----- -The current API contains still many deprecated methods, these -will be removed in one of our next releases, please -migrate to our new API. http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/pom.xml ---------------------------------------------------------------------- diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml index 35b768b..a583e8e 100644 --- a/opennlp-similarity/pom.xml +++ b/opennlp-similarity/pom.xml @@ -1,25 +1,18 @@ <?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one - or more contributor license agreements. See the NOTICE file - distributed with this work for additional information - regarding copyright ownership. The ASF licenses this file - to you under the Apache License, Version 2.0 (the - "License"); you may not use this file except in compliance - with the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, - software distributed under the License is distributed on an - "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - KIND, either express or implied. See the License for the - specific language governing permissions and limitations - under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + you under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. See the License for the specific + language governing permissions and limitations under the License. --> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> <modelVersion>4.0.0</modelVersion> <parent> @@ -31,35 +24,52 @@ <groupId>org.apache.opennlp</groupId> <artifactId>opennlp-similarity</artifactId> - <version>0.0.1</version> + <version>0.1.0</version> <packaging>jar</packaging> <name>OpenNLP Tool Similarity distribution</name> <scm> - <connection>scm:svn:http://svn.apache.org/repos/asf/opennlp/sandbox/opennlp-similarity/tags/opennlp-similarity-0.0.1</connection> - <developerConnection>scm:svn:https://svn.apache.org/repos/asf/opennlp/sandbox/opennlp-similarity/tags/opennlp-similarity-0.0.1</developerConnection> - <url>http://svn.apache.org/viewvc/opennlp/tags/opennlp-similarity-0.0.1</url> + <connection>scm:svn:http://svn.apache.org/repos/asf/opennlp/sandbox/opennlp-similarity/tags/opennlp-similarity-0.0.1</connection> + <developerConnection>scm:svn:https://svn.apache.org/repos/asf/opennlp/sandbox/opennlp-similarity/tags/opennlp-similarity-0.0.1</developerConnection> + <url>http://svn.apache.org/viewvc/opennlp/tags/opennlp-similarity-1.1.0</url> </scm> <prerequisites> <maven>3.0</maven> </prerequisites> - + <distributionManagement> + <snapshotRepository> + <id>ossrh</id> + <url>https://oss.sonatype.org/content/repositories/snapshots</url> + </snapshotRepository> + </distributionManagement> + + <repositories> <repository> - <id>net.billylieurance</id> - <name>BillyLieuranceNet</name> - <url>http://www.billylieurance.net/maven2</url> - </repository> + <id>net.billylieurance</id> + <name>BillyLieuranceNet</name> + <url>http://www.billylieurance.net/maven2</url> + </repository> </repositories> + + <properties> + <nd4j.version>0.4-rc3.4</nd4j.version> + <dl4j.version>0.4-rc3.3</dl4j.version> + </properties> <dependencies> <dependency> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-tools</artifactId> - <version>1.5.2-incubating</version> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + <version>1.6.4</version> + </dependency> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + <version>1.5.2-incubating</version> </dependency> - + <dependency> <groupId>junit</groupId> <artifactId>junit</artifactId> @@ -77,11 +87,10 @@ <artifactId>json</artifactId> <version>20090211</version> </dependency> - <dependency> <groupId>org.apache.tika</groupId> - <artifactId>tika-core</artifactId> - <version>0.7</version> + <artifactId>tika-app</artifactId> + <version>1.6</version> </dependency> <dependency> <groupId>net.sf.opencsv</groupId> @@ -91,57 +100,179 @@ <dependency> <groupId>org.apache.lucene</groupId> <artifactId>lucene-core</artifactId> - <version>4.0.0-BETA</version> + <version>4.10.0</version> </dependency> - + <dependency> <groupId>org.apache.solr</groupId> <artifactId>solr-core</artifactId> - <version>4.0.0-BETA</version> + <version>4.10.0</version> + </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>1.7</version> </dependency> <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - <version>1.7</version> + <groupId>commons-logging</groupId> + <artifactId>commons-logging</artifactId> + <version>1.1.1</version> </dependency> <dependency> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - <version>1.1.1</version> + <groupId>commons-collections</groupId> + <artifactId>commons-collections</artifactId> + <version>3.1</version> </dependency> <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient</artifactId> - <version>4.2.1</version> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient-cache</artifactId> - <version>4.2.1</version> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + <version>3.5</version> + </dependency> + + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient</artifactId> + <version>4.2.1</version> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient-cache</artifactId> + <version>4.2.1</version> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpcore</artifactId> + <version>4.2.1</version> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpmime</artifactId> + <version>4.2.1</version> </dependency> <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpcore</artifactId> - <version>4.2.1</version> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>fluent-hc</artifactId> + <version>4.2.1</version> </dependency> + <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpmime</artifactId> - <version>4.2.1</version> - </dependency> + <groupId>org.jgrapht</groupId> + <artifactId>jgrapht-jdk1.5</artifactId> + <version>0.7.3</version> + </dependency> <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>fluent-hc</artifactId> - <version>4.2.1</version> - </dependency> + <groupId>de.jollyday</groupId> + <artifactId>jollyday</artifactId> + <version>0.4.7</version> + </dependency> + <dependency> + <groupId>jgraph</groupId> + <artifactId>jgraph</artifactId> + <version>5.13.0.0</version> + </dependency> + <dependency> + <groupId>javax.mail</groupId> + <artifactId>mail</artifactId> + <version>1.4</version> + </dependency> + <dependency> + <groupId>com.restfb</groupId> + <artifactId>restfb</artifactId> + <version>1.6.12</version> + </dependency> + <dependency> + <groupId>com.memetix</groupId> + <artifactId>microsoft-translator-java-api</artifactId> + <version>0.3</version> + </dependency> + + <dependency> + <groupId>net.billylieurance.azuresearch</groupId> + <artifactId>azure-bing-search-java</artifactId> + <version>0.11.0</version> + </dependency> + <dependency> + <groupId>edu.mit</groupId> + <artifactId>jverbnet</artifactId> + <version>1.2.0</version> + <systemPath>${project.basedir}/lib/edu.mit.jverbnet-1.2.0.jar</systemPath> + <scope>system</scope> + </dependency> + <dependency> + <groupId>edu.stanford.nlp</groupId> + <artifactId>stanford-corenlp</artifactId> + <version>3.5.2</version> + </dependency> + <dependency> + <groupId>edu.stanford.nlp</groupId> + <artifactId>stanford-corenlp-model</artifactId> + <version>3.5.2</version> + <systemPath>${project.basedir}/lib/stanford-corenlp-3.5.2-models.jar</systemPath> + <scope>system</scope> + </dependency> + <dependency> + <groupId>edu.stanford.nlp</groupId> + <artifactId>ejml</artifactId> + <version>0.23</version> + <systemPath>${project.basedir}/lib/ejml-0.23.jar</systemPath> + <scope>system</scope> + </dependency> + <dependency> + <groupId>edu.stanford.nlp</groupId> + <artifactId>joda-time</artifactId> + <version>0.23</version> + <systemPath>${project.basedir}/lib/joda-time.jar</systemPath> + <scope>system</scope> + </dependency> + <dependency> + <groupId>edu.stanford.nlp</groupId> + <artifactId>jollyday</artifactId> + <version>0.23</version> + <systemPath>${project.basedir}/lib/jollyday.jar</systemPath> + <scope>system</scope> + </dependency> <dependency> - <groupId>net.billylieurance.azuresearch</groupId> - <artifactId>azure-bing-search-java</artifactId> - <version>0.11.0</version> + <groupId>edu.stanford.nlp</groupId> + <artifactId>xom</artifactId> + <version>0.23</version> + <systemPath>${project.basedir}/lib/xom.jar</systemPath> + <scope>system</scope> </dependency> - + <dependency> + <groupId>org.docx4j</groupId> + <artifactId>docx4j</artifactId> + <version>2.7.1</version> + </dependency> + + <dependency> + <groupId>org.clulab</groupId> + <artifactId>processors_2.11</artifactId> + <version>5.7.1</version> + </dependency> + <dependency> + <groupId>org.clulab</groupId> + <artifactId>processors_2.11</artifactId> + <version>5.7.1</version> + <classifier>models</classifier> + </dependency> + <dependency> + <groupId>org.deeplearning4j</groupId> + <artifactId>deeplearning4j-ui</artifactId> + <version>${dl4j.version}</version> + </dependency> + <dependency> + <groupId>org.deeplearning4j</groupId> + <artifactId>deeplearning4j-nlp</artifactId> + <version>${dl4j.version}</version> + </dependency> + <dependency> + <groupId>org.nd4j</groupId> + <artifactId>nd4j-jblas</artifactId> + <version>${nd4j.version}</version> + </dependency> + </dependencies> - + <build> <plugins> <plugin> @@ -150,10 +281,10 @@ <configuration> <source>1.5</source> <target>1.5</target> - <compilerArgument>-Xlint</compilerArgument> + <compilerArgument>-Xlint</compilerArgument> </configuration> </plugin> - + <plugin> <artifactId>maven-source-plugin</artifactId> <executions> @@ -183,70 +314,93 @@ </execution> </executions> </plugin> - <plugin> - <artifactId>maven-antrun-plugin</artifactId> - <version>1.6</version> - <executions> - <execution> - <id>generate checksums for binary artifacts</id> - <goals><goal>run</goal></goals> - <phase>verify</phase> - <configuration> - <target> - <checksum algorithm="sha1" format="MD5SUM"> - <fileset dir="${project.build.directory}"> - <include name="*.zip" /> - <include name="*.gz" /> - </fileset> - </checksum> - <checksum algorithm="md5" format="MD5SUM"> - <fileset dir="${project.build.directory}"> - <include name="*.zip" /> - <include name="*.gz" /> - </fileset> - </checksum> - </target> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <artifactId>maven-assembly-plugin</artifactId> - <executions> - <execution> - <id>src</id> - <goals> - <goal>single</goal> - </goals> - <phase>package</phase> - <configuration> - <descriptors> - <descriptor>src/main/assembly/assembly.xml</descriptor> - </descriptors> - </configuration> - </execution> - <execution> - <id>source-release-assembly</id> - <configuration> - <skipAssembly>true</skipAssembly> - <mavenExecutorId>forked-path</mavenExecutorId> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-gpg-plugin</artifactId> - <executions> - <execution> - <id>sign-artifacts</id> - <phase>verify</phase> - <goals> - <goal>sign</goal> - </goals> - </execution> - </executions> - </plugin> + <plugin> + <artifactId>maven-antrun-plugin</artifactId> + <version>1.6</version> + <executions> + <execution> + <id>generate checksums for binary artifacts</id> + <goals> + <goal>run</goal> + </goals> + <phase>verify</phase> + <configuration> + <target> + <checksum algorithm="sha1" format="MD5SUM"> + <fileset dir="${project.build.directory}"> + <include name="*.zip" /> + <include name="*.gz" /> + </fileset> + </checksum> + <checksum algorithm="md5" format="MD5SUM"> + <fileset dir="${project.build.directory}"> + <include name="*.zip" /> + <include name="*.gz" /> + </fileset> + </checksum> + </target> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>src</id> + <goals> + <goal>single</goal> + </goals> + <phase>package</phase> + <configuration> + <descriptors> + <descriptor>src/main/assembly/assembly.xml</descriptor> + </descriptors> + </configuration> + </execution> + <execution> + <id>source-release-assembly</id> + <configuration> + <skipAssembly>true</skipAssembly> + <mavenExecutorId>forked-path</mavenExecutorId> + </configuration> + </execution> + </executions> + </plugin> + <!-- <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-gpg-plugin</artifactId> + <executions> + <execution> + <id>sign-artifacts</id> + <phase>verify</phase> + <goals> + <goal>sign</goal> + </goals> + </execution> + </executions> + </plugin> + --> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <version>3.1</version> + <configuration> + <source>1.8</source> + <target>1.8</target> + </configuration> + </plugin> + <plugin> + <groupId>org.sonatype.plugins</groupId> + <artifactId>nexus-staging-maven-plugin</artifactId> + <version>1.6.3</version> + <extensions>true</extensions> + <configuration> + <serverId>ossrh</serverId> + <nexusUrl>https://oss.sonatype.org/</nexusUrl> + <autoReleaseAfterClose>true</autoReleaseAfterClose> + </configuration> + </plugin> </plugins> </build> </project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java index b75a13b..b712847 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/contentgen/multithreaded/BingWebQueryRunnerThread.java @@ -6,7 +6,7 @@ import java.util.List; import opennlp.tools.similarity.apps.BingQueryRunner; import opennlp.tools.similarity.apps.HitBase; - public class BingWebQueryRunnerThread extends BingQueryRunner implements Runnable{ +public class BingWebQueryRunnerThread extends BingQueryRunner implements Runnable{ private String query; private List<HitBase> results= new ArrayList<HitBase>(); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java index 45dadf9..fafdef0 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/POStags.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.apps.relevanceVocabs; public interface POStags { http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java index ae2772b..0d2ba00 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/PhraseProcessor.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.apps.relevanceVocabs; import java.util.ArrayList; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java index 150b3df..aced079 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SentimentVocab.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.apps.relevanceVocabs; import java.util.HashMap; @@ -59,7 +76,7 @@ public class SentimentVocab { private static final String[] POSITIVE_NOUN_LIST = { "ability", "benefit", "character", "charm", "comfort", "discount", "dream", "elegance", "favourite", "feature", "improvement", "luck", "luxury", "offer", - "pro", "quality", "requirement", "usability" }; + "quality", "requirement", "usability" }; private static final String[] NEGATIVE_NOUN_LIST = { "blocker", "challenge", "complain", "complaint", "compromise", "con", http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java index 7c12c9a..37f57e4 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymListFilter.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.apps.relevanceVocabs; import java.io.BufferedReader; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java index 804fc2b..7e680de 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/SynonymMap.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.apps.relevanceVocabs; import java.io.IOException; @@ -12,50 +29,7 @@ import java.io.IOException; import java.util.TreeMap; import java.util.TreeSet; - /** - * Loads the <a target="_blank" - * href="http://www.cogsci.princeton.edu/~wn/">WordNet </a> prolog file <a - * href="http://www.cogsci.princeton.edu/2.0/WNprolog-2.0.tar.gz">wn_s.pl </a> - * into a thread-safe main-memory hash map that can be used for fast - * high-frequency lookups of synonyms for any given (lowercase) word string. - * <p> - * There holds: If B is a synonym for A (A -> B) then A is also a synonym for B (B -> A). - * There does not necessarily hold: A -> B, B -> C then A -> C. - * <p> - * Loading typically takes some 1.5 secs, so should be done only once per - * (server) program execution, using a singleton pattern. Once loaded, a - * synonym lookup via {@link #getSynonyms(String)}takes constant time O(1). - * A loaded default synonym map consumes about 10 MB main memory. - * An instance is immutable, hence thread-safe. - * <p> - * This implementation borrows some ideas from the Lucene Syns2Index demo that - * Dave Spencer originally contributed to Lucene. Dave's approach - * involved a persistent Lucene index which is suitable for occasional - * lookups or very large synonym tables, but considered unsuitable for - * high-frequency lookups of medium size synonym tables. - * <p> - * Example Usage: - * <pre> - * String[] words = new String[] { "hard", "woods", "forest", "wolfish", "xxxx"}; - * SynonymMap map = new SynonymMap(new FileInputStream("samples/fulltext/wn_s.pl")); - * for (int i = 0; i < words.length; i++) { - * String[] synonyms = map.getSynonyms(words[i]); - * System.out.println(words[i] + ":" + java.util.Arrays.asList(synonyms).toString()); - * } - * - * Example output: - * hard:[arduous, backbreaking, difficult, fermented, firmly, grueling, gruelling, heavily, heavy, intemperately, knockout, laborious, punishing, severe, severely, strong, toilsome, tough] - * woods:[forest, wood] - * forest:[afforest, timber, timberland, wood, woodland, woods] - * wolfish:[edacious, esurient, rapacious, ravening, ravenous, voracious, wolflike] - * xxxx:[] - * </pre> - * - * @see <a target="_blank" - * href="http://www.cogsci.princeton.edu/~wn/man/prologdb.5WN.html">prologdb - * man page </a> - * @see <a target="_blank" href="http://www.hostmon.com/rfc/advanced.jsp">Dave's synonym demo site</a> - */ + public class SynonymMap { /** the index data; Map<String word, String[] synonyms> */ @@ -73,7 +47,7 @@ import java.io.IOException; * @param input * the stream to read from (null indicates an empty synonym map) * @throws IOException - * if an error occured while reading the stream. + * if an error occurred while reading the stream. */ public SynonymMap(InputStream input) throws IOException { this.table = input == null ? new HashMap<String,String[]>(0) : read(toByteArray(input)); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java index dbbec1d..cfae086 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/relevanceVocabs/WordDictionary.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.apps.relevanceVocabs; import java.util.HashMap; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java index 0b99fc2..ac7cb95 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/utils/email/EmailSender.java @@ -14,7 +14,7 @@ import javax.activation.*; */ public class EmailSender { private static final long serialVersionUID = 1L; - private static final String mailboxAddress="[email protected]"; + private static final String mailboxAddress="[email protected]"; public boolean sendMail(String smtp, String user, String pass, InternetAddress from, InternetAddress[] to, InternetAddress[] cc, InternetAddress[] bcc, String subject, String body, String file) throws Exception { @@ -34,7 +34,7 @@ public class EmailSender { Properties props = new Properties(); props.put("mail.smtp.host", smtp); props.put("mail.smtp.auth", "true"); - props.put("mail.smtp.port", "587"); + props.put("mail.smtp.port", "465"); props.put("mail.smtp.starttls.enable", "true"); Authenticator auth = new SMTP_Authenticator (user, pass); Session session = Session.getInstance(props, auth); @@ -158,7 +158,8 @@ public class EmailSender { public static void main(String[] args){ EmailSender s = new EmailSender(); try { - s.sendMail("smtp.live.com", "[email protected]", "******", new InternetAddress("[email protected]"), new InternetAddress[]{new InternetAddress("[email protected]")}, new InternetAddress[]{}, new InternetAddress[]{}, + s.sendMail("smtp.rambler.ru", "[email protected]", "b06g93", + new InternetAddress("[email protected]"), new InternetAddress[]{new InternetAddress("[email protected]")}, new InternetAddress[]{}, new InternetAddress[]{}, "Generated content for you", "body", null); } catch (AddressException e) { // TODO Auto-generated catch block http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java index 9081e1a..694da0a 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/jsmlearning/ProfileReaderWriter.java @@ -123,6 +123,31 @@ public class ProfileReaderWriter { e.printStackTrace(); } } + public static void appendReport( List<String[]> allLines, String reportName){ + List<String[]> previous; + try { + previous = readProfiles(reportName); + allLines.addAll(previous); + } catch (Exception e1) { + System.out.println("Creating file "+reportName); + } + + CSVWriter writer = null; + try { + writer = new CSVWriter(new PrintWriter(reportName)); + } catch (FileNotFoundException e) { + e.printStackTrace(); + } + + writer.writeAll(allLines); + + try { + writer.flush(); + writer.close(); + } catch (IOException e) { + e.printStackTrace(); + } + } public static void writeReportListStr(List<String> res, String string) { // TODO Auto-generated method stub http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java index 10e9683..8f215f7 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseCorefsBuilder.java @@ -19,8 +19,8 @@ import edu.stanford.nlp.util.*; public class ParseCorefsBuilder { protected static ParseCorefsBuilder instance; - private Annotation annotation; - StanfordCoreNLP pipeline; + protected Annotation annotation; + protected StanfordCoreNLP pipeline; CommunicativeActionsArcBuilder caFinder = new CommunicativeActionsArcBuilder(); /** @@ -35,9 +35,9 @@ public class ParseCorefsBuilder { return instance; } - ParseCorefsBuilder(){ + protected ParseCorefsBuilder(){ Properties props = new Properties(); - props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref"); + props.put("annotators", "tokenize, ssplit, pos, lemma, ner, parse, dcoref, sentiment"); pipeline = new StanfordCoreNLP(props); } @@ -104,30 +104,18 @@ public class ParseCorefsBuilder { new Pair<Integer, Integer>(njSentence,njWord), mi.mentionSpan, mj.mentionSpan, arcType); arcs.add(arc); - - /* - System.out.println("animacy = "+m.animacy); - System.out.println("mention span = "+m.mentionSpan); - System.out.println(" id = "+m.mentionID); - System.out.println(" position = "+m.position); - System.out.println(" start index = "+m.startIndex); - System.out.println(" end index = "+m.endIndex); - System.out.println(" mentionType = "+m.mentionType); - System.out.println(" number = = "+m.number); - */ } } - - } List<WordWordInterSentenceRelationArc> arcsCA = buildCAarcs(nodesThicket); + arcs.addAll(arcsCA); ParseThicket result = new ParseThicket(ptTrees, arcs); result.setNodesThicket(nodesThicket); return result; } - private List<WordWordInterSentenceRelationArc> buildCAarcs( + public List<WordWordInterSentenceRelationArc> buildCAarcs( List<List<ParseTreeNode>> nodesThicket) { List<WordWordInterSentenceRelationArc> arcs = new ArrayList<WordWordInterSentenceRelationArc>(); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java index e584d1e..8723e53 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseThicket.java @@ -13,6 +13,36 @@ public class ParseThicket { // then list for all sentences private List<List<ParseTreeNode>> sentenceNodes; + private List<Float> sentimentProfile; + + private String origText; + private List<List<ParseTreeNode>> phrases; + + + public List<Tree> getSentenceTrees() { + return sentenceTrees; + } + + public void setSentenceTrees(List<Tree> sentenceTrees) { + this.sentenceTrees = sentenceTrees; + } + + public List<List<ParseTreeNode>> getSentenceNodes() { + return sentenceNodes; + } + + public void setSentenceNodes(List<List<ParseTreeNode>> sentenceNodes) { + this.sentenceNodes = sentenceNodes; + } + + public String getOrigText() { + return origText; + } + + public void setOrigText(String origText) { + this.origText = origText; + } + public List<Tree> getSentences() { return sentenceTrees; } @@ -53,6 +83,22 @@ public class ParseThicket { public String toString(){ return this.sentenceTrees+"\n"+this.arcs; } + + public void setPhrases(List<List<ParseTreeNode>> phrs) { + this.phrases = phrs; + } + + public List<List<ParseTreeNode>> getPhrases() { + return phrases; + } + + public List<Float> getSentimentProfile() { + return sentimentProfile; + } + + public void setSentimentProfile(List<Float> sentimentProfile) { + this.sentimentProfile = sentimentProfile; + } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java index 528eb4d..689a4b8 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/ParseTreeNode.java @@ -2,25 +2,92 @@ package opennlp.tools.parse_thicket; import java.util.ArrayList; import java.util.List; +import java.util.Map; public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{ - String word; - // this is the POS tag of the token - String pos; - // this is the NER label of the token - String ne; - Integer id; - //PhraseType - String phraseType; - - public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP"); - private PhraseType(final String text) { - this.text = text; - } - private final String text; - - } - + String word; // word in normal form, lemma + // this is the POS tag of the token + String pos; + // this is the NER label of the token + String ne; + Integer id; + //PhraseType + String phraseType; + Map<String, Object> attributes; + String normalizedWord; + String syntacticDependence; + String originalWord; //what actually occurs in a sentence + + String head; + String label; + String modifier; + + + + public String getOriginalWord() { + return originalWord; + } + + public void setOriginalWord(String originalWord) { + this.originalWord = originalWord; + } + + public String getHead() { + return head; + } + + public void setHead(String head) { + this.head = head; + } + + public String getLabel() { + return label; + } + + public void setLabel(String label) { + this.label = label; + } + + public String getModifier() { + return modifier; + } + + public void setModifier(String modifier) { + this.modifier = modifier; + } + + public String getNormalizedWord() { + return normalizedWord; + } + + public void setNormalizedWord(String normalizedWord) { + this.normalizedWord = normalizedWord; + } + + public String getSyntacticDependence() { + return syntacticDependence; + } + + public void setSyntacticDependence(String syntacticDependence) { + this.syntacticDependence = syntacticDependence; + } + + public Map<String, Object> getAttributes() { + return attributes; + } + + public void setAttributes(Map<String, Object> attributes) { + this.attributes = attributes; + } + + public enum PhraseType {NP("NP"), VP("VP"), PRP("PRP"); + private PhraseType(final String text) { + this.text = text; + } + private final String text; + + } + public ParseTreeNode(String word, String pos, String ne, Integer id) { super(); this.word = word; @@ -28,15 +95,14 @@ public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{ this.ne = ne; this.id = id; } - + public ParseTreeNode(String word, String pos) { super(); this.word = word; this.pos = pos; - this.ne = ne; - this.id = id; + } - + public String getPhraseType() { return phraseType; } @@ -67,7 +133,7 @@ public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{ public void setId(Integer id) { this.id = id; } - + public String toString(){ StringBuffer buf = new StringBuffer(); if (id!=null) @@ -81,10 +147,27 @@ public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{ return buf.toString(); } + public static String toTreeRepresentationString(List<ParseTreeNode> chList){ + StringBuffer buf = new StringBuffer(); + for(ParseTreeNode ch: chList){ + if (ch.getPos().startsWith(".") || ch.getPos().startsWith(",") || ch.getPos().startsWith(";") || ch.getPos().startsWith("!")) + continue; + buf.append( "("+ch.getWord()+ " " + ch.getPos() + ")" ); + } + return buf.toString().trim(); + } + public static String toWordString(List<ParseTreeNode> chList){ + String buf = ""; + for(ParseTreeNode ch: chList){ + buf+=ch.getWord()+ " "; + } + return buf.trim(); + } + @Override public List<ParseTreeNode> generalize(Object o1, Object o2) { List<ParseTreeNode> result = new ArrayList<ParseTreeNode>(); - + ParseTreeNode w1 = (ParseTreeNode) o1; ParseTreeNode w2 = (ParseTreeNode) o2; String posGen = generalizePOS(w1.pos, w2.pos); @@ -95,7 +178,7 @@ public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{ result.add(newNode); return result; } - + public String generalizeWord(String lemma1, String lemma2){ if (lemma1.equals(lemma2)) return lemma1; @@ -105,49 +188,49 @@ public class ParseTreeNode implements IGeneralizer<ParseTreeNode>{ return "*"; //TODO return "*"; - + } - + public String generalizePOS(String pos1, String pos2) { - if ((pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN") - && pos1.equals("NP"))) { - return "NN"; - } - if ((pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("VBG") - && pos1.equals("NN"))) { - return "NN"; - } - - if ((pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN") - && pos1.equals("ADJP"))) { - return "NN"; - } - if ((pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO") - && pos2.equals("IN"))) { - return "IN"; - } - // VBx vs VBx = VB (does not matter which form for verb) - if (pos1.startsWith("VB") && pos2.startsWith("VB")) { - return "VB"; - } - - // ABx vs ABy always gives AB - if (pos1.equalsIgnoreCase(pos2)) { - return pos1; - } - if (pos1.length() > 2) { - pos1 = pos1.substring(0, 2); - } - - if (pos2.length() > 2) { - pos2 = pos2.substring(0, 2); - } - if (pos1.equalsIgnoreCase(pos2)) { - return pos1 + "*"; - } - return null; - } - - + if ((pos1.startsWith("NN") && pos2.equals("NP") || pos2.startsWith("NN") + && pos1.equals("NP"))) { + return "NN"; + } + if ((pos1.startsWith("NN") && pos2.equals("VBG") || pos2.startsWith("VBG") + && pos1.equals("NN"))) { + return "NN"; + } + + if ((pos1.startsWith("NN") && pos2.equals("ADJP") || pos2.startsWith("NN") + && pos1.equals("ADJP"))) { + return "NN"; + } + if ((pos1.equals("IN") && pos2.equals("TO") || pos1.equals("TO") + && pos2.equals("IN"))) { + return "IN"; + } + // VBx vs VBx = VB (does not matter which form for verb) + if (pos1.startsWith("VB") && pos2.startsWith("VB")) { + return "VB"; + } + + // ABx vs ABy always gives AB + if (pos1.equalsIgnoreCase(pos2)) { + return pos1; + } + if (pos1.length() > 2) { + pos1 = pos1.substring(0, 2); + } + + if (pos2.length() > 2) { + pos2 = pos2.substring(0, 2); + } + if (pos1.equalsIgnoreCase(pos2)) { + return pos1 + "*"; + } + return null; + } + + }; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java index db7905d..265a3fa 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/WordWordInterSentenceRelationArc.java @@ -61,7 +61,7 @@ public class WordWordInterSentenceRelationArc { } public String toString(){ - return "<sent="+codeFrom.getFirst()+"-word="+codeFrom.getSecond()+".."+lemmaFrom+"> ===> "+ + return arcType.toString()+"&<sent="+codeFrom.getFirst()+"-word="+codeFrom.getSecond()+".."+lemmaFrom+"> ===> "+ "<sent="+codeTo.getFirst()+"-word="+codeTo.getSecond()+".."+lemmaTo+">"; } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java index ce4b600..edd164f 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/apps/MultiSentenceSearchResultsProcessor.java @@ -73,7 +73,7 @@ public class MultiSentenceSearchResultsProcessor { hit.setSource(match.toString()); } if (score < 2){ // attempt to match with snippet, if not much luck with original text - match = matcher.assessRelevanceCache(pageSentsAndSnippet[0] , + match = matcher.assessRelevanceCache(pageSentsAndSnippet[1] , searchQuery); score = parseTreeChunkListScorer.getParseTreeChunkListScore(match); } @@ -161,7 +161,7 @@ public class MultiSentenceSearchResultsProcessor { LOG.info("No search results for query '" + query); return null; } - ProfileReaderWriter.writeReport(reportData, "resultsForQuery_"+query.replace(' ', '_')+".csv"); + //ProfileReaderWriter.writeReport(reportData, "resultsForQuery_"+query.replace(' ', '_')+".csv"); return hits; } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java index eb67724..c568035 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceExtendedForestSearchResultsProcessorSetFormer.java @@ -66,7 +66,7 @@ public class MultiSentenceExtendedForestSearchResultsProcessorSetFormer extends private List<HitBase> formTreeForestDataSet( List<HitBase> hits, String query, boolean isPositive) { List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); - // form the training set from original documets. Since search results are ranked, we set the first half as positive set, + // form the training set from original documents. Since search results are ranked, we set the first half as positive set, //and the second half as negative set. // after re-classification, being re-ranked, the search results might end up in a different set List<String[]> treeBankBuffer = new ArrayList<String[]>(); @@ -117,7 +117,6 @@ public class MultiSentenceExtendedForestSearchResultsProcessorSetFormer extends treeBankBuffer.add(new String[] {posOrNeg+" |BT| "+t.toString()+ " |ET|"}); } } catch (Exception e) { - // TODO Auto-generated catch block e.printStackTrace(); } return treeBankBuffer; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java index df6189d..39d348e 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/MultiSentenceKernelBasedSearchResultsProcessor.java @@ -90,7 +90,7 @@ public class MultiSentenceKernelBasedSearchResultsProcessor extends MultiSenten private List<HitBase> filterOutIrrelevantHitsByTreeKernelLearning( List<HitBase> hits, String query) { List<HitBase> newHitList = new ArrayList<HitBase>(), newHitListReRanked = new ArrayList<HitBase>(); - // form the training set from original documets. Since search results are ranked, we set the first half as positive set, + // form the training set from original documents. Since search results are ranked, we set the first half as positive set, //and the second half as negative set. // after re-classification, being re-ranked, the search results might end up in a different set List<String[]> treeBankBuffer = new ArrayList<String[]>(); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java index 9c1c44a..fb5eed8 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/PT2ExtendedTreeForestBuilder.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.parse_thicket.kernel_interface; import java.util.ArrayList; @@ -32,6 +49,22 @@ public class PT2ExtendedTreeForestBuilder { return treeBankBuffer; } + private String formTrainingSetFromTextOneLine(String para, boolean positive){ + String prefix = null; + if (positive) + prefix=" 1 "; + else + prefix=" -1 "; + + ParseThicket pt = matcher.buildParseThicketFromTextWithRST(para); + List<Tree> forest = pt.getSentences(); + String line = prefix; + for(Tree t: forest){ + line+= "|BT| "+t.toString()+ " |ET| "; + } + return line; + } + public void formPosNegTrainingSet(String pos, String neg, String path){ List<String[]> list = formTrainingSetFromText(pos, true), negList= formTrainingSetFromText(neg, false); @@ -50,8 +83,6 @@ public class PT2ExtendedTreeForestBuilder { ProfileReaderWriter.writeReport(treeBankBuffer, path+"unknown.txt", ' '); tkRunner.runClassifier(path, "unknown.txt", modelFileName, "classifier_output.txt"); - - } http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java index 4cf3b34..d6a295f 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/SnippetToParagraphFull.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.parse_thicket.kernel_interface; import java.util.ArrayList; http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java index 47e474f..c980f9f 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeExtenderByAnotherLinkedTree.java @@ -1,29 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.parse_thicket.kernel_interface; import java.util.ArrayList; import java.util.List; +import java.util.logging.Logger; import opennlp.tools.jsmlearning.ProfileReaderWriter; import opennlp.tools.parse_thicket.ParseThicket; import opennlp.tools.parse_thicket.ParseTreeNode; +import opennlp.tools.parse_thicket.VerbNetProcessor; import opennlp.tools.parse_thicket.WordWordInterSentenceRelationArc; import opennlp.tools.parse_thicket.matching.Matcher; import opennlp.tools.parse_thicket.matching.PT2ThicketPhraseBuilder; import edu.stanford.nlp.trees.Tree; public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { + private static Logger log = Logger + .getLogger("opennlp.tools.parse_thicket.kernel_interface.TreeExtenderByAnotherLinkedTree"); public List<String> buildForestForCorefArcs(ParseThicket pt){ List<String> results = new ArrayList<String>(); for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ - if (!arc.getArcType().getType().startsWith("coref")) - continue; + //if (!arc.getArcType().getType().startsWith("coref")) + // continue; int fromSent = arc.getCodeFrom().getFirst(); int toSent = arc.getCodeTo().getFirst(); + if (fromSent <1 || toSent <1 ) // TODO problem in sentence enumeration => skip building extended trees + return results; + String wordFrom = arc.getLemmaFrom(); String wordTo = arc.getLemmaTo(); - List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent-1), pt.getSentences().get(fromSent-1), new String[]{ wordFrom}); + List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent-1), + pt.getSentences().get(fromSent-1), new String[]{ wordFrom}); if (trees==null || trees.size()<1) continue; System.out.println(trees); @@ -32,13 +57,52 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { System.out.println(sb.toString()); results.add(sb.toString()); } - /* - List<String[]> treeBankBuffer = new ArrayList<String[]>(); - for(String t: results){ - treeBankBuffer.add(new String[] {" 0 |BT|"+t.toString()+ "|ET|"}); + // if no arcs then orig sentences + if (results.isEmpty()){ + for(Tree t: pt.getSentences()){ + results.add(t.toString()); + } + } + return results; + } + // sentences in pt are enumerarted starting from 0; + //this func works with Sista version of Stanford NLP and sentences are coded from 0 + public List<String> buildForestForRSTArcs(ParseThicket pt){ + List<String> results = new ArrayList<String>(); + for(WordWordInterSentenceRelationArc arc: pt.getArcs()){ + // TODO - uncomment + //if (!arc.getArcType().getType().startsWith("rst")) + // continue; + int fromSent = arc.getCodeFrom().getFirst(); + int toSent = arc.getCodeTo().getFirst(); + + String wordFrom = arc.getLemmaFrom(); + String wordTo = arc.getLemmaTo(); + + if (wordFrom == null || wordFrom.length()<1 || wordTo == null || wordTo.length()<1) + log.severe("Empty lemmas for RST arc "+ arc); + + List<Tree> trees = getASubtreeWithRootAsNodeForWord1(pt.getSentences().get(fromSent), + pt.getSentences().get(fromSent), new String[]{ wordFrom}); + if (trees==null || trees.size()<1) + continue; + System.out.println(trees); + StringBuilder sb = new StringBuilder(10000); + Tree tree = trees.get(0); + // instead of phrase type for the root of the tree, we want to put the RST relation name + if (arc.getArcType().getType().startsWith("rst")) + tree.setValue(arc.getArcType().getSubtype()); + + toStringBuilderExtenderByAnotherLinkedTree1(sb, pt.getSentences().get(toSent), tree, new String[]{wordTo}); + System.out.println(sb.toString()); + results.add(sb.toString()); + } + // if no arcs then orig sentences + if (results.isEmpty()){ + for(Tree t: pt.getSentences()){ + results.add(t.toString()); + } } - ProfileReaderWriter.writeReport(treeBankBuffer, "C:\\stanford-corenlp\\tree_kernel\\unknownForest.txt", ' '); - */ return results; } @@ -75,8 +139,6 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { } sb.append(' '); toStringBuilderExtenderByAnotherLinkedTree1(sb, treeToInsert, null, null); - int z=0; z++; - } else { for (Tree kid : kids) { sb.append(' '); @@ -90,6 +152,7 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { } } + // given a parse tree and a public List<Tree> getASubtreeWithRootAsNodeForWord1(Tree tree, Tree currentSubTree, String[] corefWords){ if (currentSubTree.isLeaf()){ return null; @@ -97,26 +160,23 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { List<Tree> result = null; Tree[] kids = currentSubTree.children(); if (kids != null) { - boolean bInsert=false; + boolean bFound=false; String word = corefWords[corefWords.length-1]; - for (Tree kid : kids) { - if (bInsert){ + if (bFound){ result.add(kid); } else { - String phraseStr = kid.toString(); phraseStr=phraseStr.replace(")", ""); - if (phraseStr.endsWith(word)){ - bInsert=true; + if (phraseStr.endsWith(word)){ // found + bFound=true; result = new ArrayList<Tree>(); } } } - if (bInsert){ + if (bFound){ return result; } - // if not a selected node, proceed with iteration for (Tree kid : kids) { List<Tree> ts = getASubtreeWithRootAsNodeForWord1(tree, kid, corefWords); @@ -128,7 +188,7 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { return null; } - + // now obsolete public Tree[] getASubtreeWithRootAsNodeForWord(Tree tree, Tree currentSubTree, String[] corefWords){ if (currentSubTree.isLeaf()){ return null; @@ -238,7 +298,7 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { } } - private StringBuilder toStringBuilder(StringBuilder sb, Tree t) { + public StringBuilder toStringBuilder(StringBuilder sb, Tree t) { if (t.isLeaf()) { if (t.label() != null) { sb.append(t.label().value()); @@ -263,22 +323,25 @@ public class TreeExtenderByAnotherLinkedTree extends PT2ThicketPhraseBuilder { } public static void main(String[] args){ + VerbNetProcessor p = VerbNetProcessor. + getInstance("/Users/borisgalitsky/Documents/workspace/deepContentInspection/src/test/resources"); + Matcher matcher = new Matcher(); TreeExtenderByAnotherLinkedTree extender = new TreeExtenderByAnotherLinkedTree(); ParseThicket pt = matcher.buildParseThicketFromTextWithRST(//"I went to the forest to look for a tree. I found out that it was thick and green"); - "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons."+ + "Iran refuses to accept the UN proposal to end its dispute over its work on nuclear weapons. "+ "UN nuclear watchdog passes a resolution condemning Iran for developing its second uranium enrichment site in secret. " + "A recent IAEA report presented diagrams that suggested Iran was secretly working on nuclear weapons. " + "Iran envoy says its nuclear development is for peaceful purpose, and the material evidence against it has been fabricated by the US. "); List<String> results = extender.buildForestForCorefArcs(pt); System.out.println(results); - System.exit(0); + //System.exit(0); List<Tree> forest = pt.getSentences(); - List<Tree> trees = extender.getASubtreeWithRootAsNodeForWord1(forest.get(1), forest.get(1), new String[]{"it"}); + List<Tree> trees = extender.getASubtreeWithRootAsNodeForWord1(forest.get(1), forest.get(1), new String[]{"its"}); System.out.println(trees); StringBuilder sb = new StringBuilder(10000); extender.toStringBuilderExtenderByAnotherLinkedTree1(sb, forest.get(0), trees.get(0), new String[]{"the", "forest"}); http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java index f00904f..294fb38 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/kernel_interface/TreeKernelRunner.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package opennlp.tools.parse_thicket.kernel_interface; import java.io.BufferedReader; @@ -30,11 +47,18 @@ public class TreeKernelRunner { public void runLearner(String dir, String learning_file, String model_file) { + if (!dir.endsWith("/")) + dir+="/"; + String[] runString = new String[]{dir+"svm_learn","-t", "5","-j","2","-W","A", dir+learning_file, dir+model_file}; + runEXE(runString, dir); + } + public void runLearnerWin(String dir, String learning_file, String model_file) + { dir = dir.replace('/', '\\'); if (!dir.endsWith("\\")) dir+="\\"; - String[] runString = new String[]{dir+"svm_learn.exe","-t", "5", dir+learning_file, dir+model_file}; + String[] runString = new String[]{dir+"svm_learn.exe","-t", "5","-j","2","-W","A", dir+learning_file, dir+model_file}; runEXE(runString, dir); } @@ -42,6 +66,13 @@ public class TreeKernelRunner { //svm_classify example_file model_file predictions_file public void runClassifier(String dir, String example_file, String model_file, String predictions_file) { + if (!dir.endsWith("/")) + dir+="/"; + String[] runString = new String[]{dir+"svm_classify", dir+example_file, dir+model_file, dir+predictions_file}; + runEXE(runString, dir); + } + public void runClassifierWin(String dir, String example_file, String model_file, String predictions_file) + { dir = dir.replace('/', '\\'); if (!dir.endsWith("\\")) http://git-wip-us.apache.org/repos/asf/opennlp-sandbox/blob/9aa270c1/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java ---------------------------------------------------------------------- diff --git a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java b/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java deleted file mode 100644 index ef0569a..0000000 --- a/opennlp-similarity/src/main/java/opennlp/tools/parse_thicket/matching/GeneralizationListReducer.java +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package opennlp.tools.parse_thicket.matching; - -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; - -public class GeneralizationListReducer { - public List<ParseTreePath> applyFilteringBySubsumption_OLD( - List<ParseTreePath> result) { - List<ParseTreePath> resultDupl = new ArrayList<ParseTreePath>(); - resultDupl.addAll(new HashSet<ParseTreePath>(result)); - result = resultDupl; - if (result.size() < 2) - return result; // nothing to reduce - List<ParseTreePath> resultReduced = new ArrayList<ParseTreePath>(); - int size = result.size(); - for (int i = 0; i < size; i++) { - Boolean bSubChunk = false; - for (int j = 0; j < size; j++) { - if (i == j) { - continue; - } - if (result.get(j).isASubChunk(result.get(i))) { - bSubChunk = true; - } - } - if (!bSubChunk) - resultReduced.add(result.get(i)); - } - - if (resultReduced.size() < 1) { - System.err.println("Wrong subsumption reduction"); - } - - if (resultReduced.size() > 1) { - int z = 0; - z++; - } - return resultReduced; - - } - - public List<ParseTreePath> applyFilteringBySubsumptionOLD( - List<ParseTreePath> result) { - List<ParseTreePath> resultDupl = null; - if (result.size() < 2) - return result; // nothing to reduce - List<ParseTreePath> resultReduced = new ArrayList<ParseTreePath>(); - int size = result.size(); - resultDupl = new ArrayList<ParseTreePath>(result); - for (int s = 0; s < size; s++) { - for (int i = 0; i < resultDupl.size(); i++) { - Boolean bStop = false; - for (int j = 0; j < resultDupl.size(); j++) { - if (i == j) { - continue; - } - if (result.get(j).isASubChunk(result.get(i)) - && !result.get(i).isASubChunk(result.get(j))) { - resultDupl.remove(i); - bStop = true; - break; - } - } - if (bStop) { - break; - } - } - } - resultReduced = resultDupl; - if (resultReduced.size() < 1) { - System.err.println("Wrong subsumption reduction"); - } - - if (resultReduced.size() > 1) { - int z = 0; - z++; - } - return resultReduced; - - } - - public List<ParseTreePath> applyFilteringBySubsumption( - List<ParseTreePath> result) { - List<Integer> resultDuplIndex = new ArrayList<Integer>(); - List<ParseTreePath> resultReduced = new ArrayList<ParseTreePath>(); - - if (result.size() < 2) { - return result; // nothing to reduce - } - // remove empty - for (ParseTreePath ch : result) { - if (ch.getLemmas().size() > 0) { - resultReduced.add(ch); - } - } - result = resultReduced; - - for (int i = 0; i < result.size(); i++) { - for (int j = i + 1; j < result.size(); j++) { - if (i == j) { - continue; - } - if (result.get(j).isASubChunk(result.get(i))) { - resultDuplIndex.add(i); - } else if (result.get(i).isASubChunk(result.get(j))) { - resultDuplIndex.add(j); - } - } - - } - resultReduced = new ArrayList<ParseTreePath>(); - for (int i = 0; i < result.size(); i++) { - if (!resultDuplIndex.contains(i)) { - resultReduced.add(result.get(i)); - } - } - - if (resultReduced.size() < 1) { - System.err.println("Wrong subsumption reduction"); - resultReduced = result; - } - - return resultReduced; - - } - - // testing sub-chunk functionality and - // elimination more general according to subsumption relation - -}
