This is an automated email from the ASF dual-hosted git repository. mawiesne pushed a commit to branch Reduce-Compile-and-Runtime-dependencies-in-Similarity-component in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
commit 13346cd5576fc8df8c18a2da01de6ea0dc10fba3 Author: Martin Wiesner <[email protected]> AuthorDate: Wed Jul 10 14:25:27 2024 +0200 Reduce compile and runtime dependency in Similarity Component --- opennlp-similarity/pom.xml | 1054 ++++++++++---------- .../review_builder/MachineTranslationWrapper.java | 22 +- .../tools/doc_classifier/DocClassifier.java | 43 +- ...cClassifierTrainingSetMultilingualExtender.java | 3 +- .../DocClassifierTrainingSetVerifier.java | 18 +- .../tools/textsimilarity/ParseTreeChunk.java | 3 +- 6 files changed, 539 insertions(+), 604 deletions(-) diff --git a/opennlp-similarity/pom.xml b/opennlp-similarity/pom.xml index bb8aa6e..4c47672 100644 --- a/opennlp-similarity/pom.xml +++ b/opennlp-similarity/pom.xml @@ -12,573 +12,533 @@ language governing permissions and limitations under the License. --> <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-sandbox</artifactId> - <version>2.3.4-SNAPSHOT</version> - </parent> + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-sandbox</artifactId> + <version>2.3.4-SNAPSHOT</version> + </parent> - <artifactId>opennlp-similarity</artifactId> - <version>2.3.4-SNAPSHOT</version> - <packaging>jar</packaging> + <artifactId>opennlp-similarity</artifactId> + <version>2.3.4-SNAPSHOT</version> + <packaging>jar</packaging> - <name>Apache OpenNLP Tool Similarity distribution</name> - - <properties> - <dl4j.version>1.0.0-M2.1</dl4j.version> - <hdf5.version>1.14.3-1.5.10</hdf5.version> - <javacpp.version>1.5.10</javacpp.version> - <openblas.version>0.3.26-1.5.10</openblas.version> - </properties> + <name>Apache OpenNLP Similarity distribution</name> - <repositories> - <repository> - <id>central</id> - <name>Maven Central Repository</name> - <url>https://repo1.maven.org/maven2</url> - </repository> - <repository> - <id>billylieurance-net</id> - <url>https://www.billylieurance.net/maven2</url> - <snapshots> - <enabled>false</enabled> - </snapshots> - </repository> - </repositories> + <properties> + <dl4j.version>1.0.0-M2.1</dl4j.version> + <hdf5.version>1.14.3-1.5.10</hdf5.version> + <javacpp.version>1.5.10</javacpp.version> + <openblas.version>0.3.26-1.5.10</openblas.version> + </properties> - <dependencyManagement> - <dependencies> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient</artifactId> - <version>4.5.14</version> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient-cache</artifactId> - <version>4.5.14</version> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpcore</artifactId> - <version>4.4.16</version> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpmime</artifactId> - <version>4.5.14</version> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>fluent-hc</artifactId> - <version>4.5.14</version> - </dependency> - <!-- Required to avoid IllegalAccessError by Lombok during compilation --> - <dependency> - <groupId>org.projectlombok</groupId> - <artifactId>lombok</artifactId> - <version>1.18.34</version> - </dependency> - </dependencies> - </dependencyManagement> + <repositories> + <repository> + <id>central</id> + <name>Maven Central Repository</name> + <url>https://repo1.maven.org/maven2</url> + </repository> + <repository> + <id>billylieurance-net</id> + <url>https://www.billylieurance.net/maven2</url> + <snapshots> + <enabled>false</enabled> + </snapshots> + </repository> + </repositories> - <dependencies> - <dependency> - <groupId>org.apache.opennlp</groupId> - <artifactId>opennlp-tools</artifactId> - </dependency> + <dependencyManagement> + <dependencies> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient</artifactId> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpclient-cache</artifactId> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpcore</artifactId> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>httpmime</artifactId> + </dependency> + <dependency> + <groupId>org.apache.httpcomponents</groupId> + <artifactId>fluent-hc</artifactId> + </dependency> + <!-- Required to avoid IllegalAccessError by Lombok during compilation --> + <dependency> + <groupId>org.projectlombok</groupId> + <artifactId>lombok</artifactId> + </dependency> + </dependencies> + </dependencyManagement> - <dependency> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-api</artifactId> - </dependency> + <dependencies> + <dependency> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-tools</artifactId> + </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-api</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-core</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-slf4j2-impl</artifactId> - <scope>test</scope> - </dependency> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-api</artifactId> + </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-api</artifactId> - </dependency> + <dependency> + <groupId>commons-lang</groupId> + <artifactId>commons-lang</artifactId> + </dependency> + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + </dependency> + <dependency> + <groupId>commons-collections</groupId> + <artifactId>commons-collections</artifactId> + </dependency> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-math3</artifactId> + </dependency> + <dependency> + <groupId>org.json</groupId> + <artifactId>json</artifactId> + <version>20240303</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-app</artifactId> + <version>2.9.2</version> + </dependency> + <dependency> + <groupId>net.sf.opencsv</groupId> + <artifactId>opencsv</artifactId> + <version>2.3</version> + </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-engine</artifactId> - </dependency> + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-core</artifactId> + <version>8.11.3</version> + <exclusions> + <exclusion> + <groupId>org.apache.hadoop</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.eclipse.jetty</groupId> + <artifactId>*</artifactId> + </exclusion> + <exclusion> + <groupId>org.eclipse.jetty.http2</groupId> + <artifactId>*</artifactId> + </exclusion> + </exclusions> + </dependency> - <dependency> - <groupId>org.junit.jupiter</groupId> - <artifactId>junit-jupiter-params</artifactId> - </dependency> + <dependency> + <groupId>javax.mail</groupId> + <artifactId>mail</artifactId> + <version>1.4.7</version> + </dependency> + <dependency> + <groupId>com.restfb</groupId> + <artifactId>restfb</artifactId> + <version>1.49.0</version> + </dependency> - <dependency> - <groupId>commons-lang</groupId> - <artifactId>commons-lang</artifactId> - </dependency> - <dependency> - <groupId>commons-codec</groupId> - <artifactId>commons-codec</artifactId> - </dependency> - <dependency> - <groupId>commons-logging</groupId> - <artifactId>commons-logging</artifactId> - </dependency> - <dependency> - <groupId>commons-collections</groupId> - <artifactId>commons-collections</artifactId> - </dependency> - <dependency> - <groupId>org.apache.commons</groupId> - <artifactId>commons-math3</artifactId> - </dependency> + <dependency> + <groupId>net.billylieurance.azuresearch</groupId> + <artifactId>azure-bing-search-java</artifactId> + <version>0.13.0</version> + </dependency> - <dependency> - <groupId>org.json</groupId> - <artifactId>json</artifactId> - <version>20240303</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-app</artifactId> - <version>2.9.2</version> - </dependency> - <dependency> - <groupId>net.sf.opencsv</groupId> - <artifactId>opencsv</artifactId> - <version>2.3</version> - </dependency> + <dependency> + <groupId>edu.mit</groupId> + <artifactId>jverbnet</artifactId> + <version>1.2.0.1</version> + <exclusions> + <exclusion> + <groupId>ch.qos.logback</groupId> + <artifactId>logback-core</artifactId> + </exclusion> + <exclusion> + <groupId>ch.qos.logback</groupId> + <artifactId>logback-classic</artifactId> + </exclusion> + <!-- Avoids problems with conflicting slf4j bindings at runtime --> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>log4j-over-slf4j</artifactId> + </exclusion> + </exclusions> + </dependency> - <dependency> - <groupId>org.apache.solr</groupId> - <artifactId>solr-core</artifactId> - <version>8.11.3</version> - </dependency> - - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient</artifactId> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpclient-cache</artifactId> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpcore</artifactId> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>httpmime</artifactId> - </dependency> - <dependency> - <groupId>org.apache.httpcomponents</groupId> - <artifactId>fluent-hc</artifactId> - </dependency> + <dependency> + <groupId>org.docx4j</groupId> + <artifactId>docx4j</artifactId> + <version>6.1.2</version> + <exclusions> + <!-- Exclusion here as log4j version 2 bindings are used during tests/runtime--> + <exclusion> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-log4j12</artifactId> + </exclusion> + <exclusion> + <groupId>log4j</groupId> + <artifactId>log4j</artifactId> + </exclusion> + </exclusions> + </dependency> - <dependency> - <groupId>org.jgrapht</groupId> - <artifactId>jgrapht-jdk1.5</artifactId> - <version>0.7.3</version> - </dependency> - <dependency> - <groupId>de.jollyday</groupId> - <artifactId>jollyday</artifactId> - <version>0.5.10</version> - </dependency> - <dependency> - <groupId>jgraph</groupId> - <artifactId>jgraph</artifactId> - <version>5.13.0.0</version> - </dependency> - <dependency> - <groupId>javax.mail</groupId> - <artifactId>mail</artifactId> - <version>1.4.7</version> - </dependency> - <dependency> - <groupId>com.restfb</groupId> - <artifactId>restfb</artifactId> - <version>1.49.0</version> - </dependency> - <dependency> - <groupId>com.memetix</groupId> - <artifactId>microsoft-translator-java-api</artifactId> - <version>0.6.2</version> - </dependency> + <dependency> + <groupId>org.deeplearning4j</groupId> + <artifactId>deeplearning4j-core</artifactId> + <version>${dl4j.version}</version> + <exclusions> + <!-- Excluded to avoid irrelevant platforms dependencies, see profiles --> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>openblas-platform</artifactId> + </exclusion> + <exclusion> + <groupId>org.bytedeco</groupId> + <artifactId>hdf5-platform</artifactId> + </exclusion> + <!-- Not required for NLP applications --> + <exclusion> + <groupId>org.datavec</groupId> + <artifactId>datavec-data-image</artifactId> + </exclusion> + </exclusions> + </dependency> + <dependency> + <groupId>org.deeplearning4j</groupId> + <artifactId>deeplearning4j-ui</artifactId> + <version>${dl4j.version}</version> + </dependency> + <dependency> + <groupId>org.deeplearning4j</groupId> + <artifactId>deeplearning4j-nlp</artifactId> + <version>${dl4j.version}</version> + </dependency> - <dependency> - <groupId>net.billylieurance.azuresearch</groupId> - <artifactId>azure-bing-search-java</artifactId> - <version>0.13.0</version> - </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>openblas</artifactId> + <version>${openblas.version}</version> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacpp</artifactId> + <version>${javacpp.version}</version> + </dependency> - <dependency> - <groupId>edu.mit</groupId> - <artifactId>jverbnet</artifactId> - <version>1.2.0.1</version> - <exclusions> - <exclusion> - <groupId>ch.qos.logback</groupId> - <artifactId>logback-core</artifactId> - </exclusion> - <exclusion> - <groupId>ch.qos.logback</groupId> - <artifactId>logback-classic</artifactId> - </exclusion> - <!-- Avoids problems with conflicting slf4j bindings at runtime --> - <exclusion> - <groupId>org.slf4j</groupId> - <artifactId>log4j-over-slf4j</artifactId> - </exclusion> - </exclusions> - </dependency> - - <dependency> - <groupId>org.docx4j</groupId> - <artifactId>docx4j</artifactId> - <version>6.1.2</version> - <exclusions> - <!-- Exclusion here as log4j version 2 bindings are used during tests/runtime--> - <exclusion> - <groupId>org.slf4j</groupId> - <artifactId>slf4j-log4j12</artifactId> - </exclusion> - <exclusion> - <groupId>log4j</groupId> - <artifactId>log4j</artifactId> - </exclusion> - </exclusions> - </dependency> + <!-- TEST --> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-api</artifactId> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-engine</artifactId> + </dependency> + <dependency> + <groupId>org.junit.jupiter</groupId> + <artifactId>junit-jupiter-params</artifactId> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j2-impl</artifactId> + <scope>test</scope> + </dependency> + </dependencies> - <dependency> - <groupId>org.deeplearning4j</groupId> - <artifactId>deeplearning4j-core</artifactId> - <version>${dl4j.version}</version> - <exclusions> - <!-- Excluded to avoid irrelevant platforms dependencies, see profiles --> - <exclusion> - <groupId>org.bytedeco</groupId> - <artifactId>openblas-platform</artifactId> - </exclusion> - <exclusion> - <groupId>org.bytedeco</groupId> - <artifactId>hdf5-platform</artifactId> - </exclusion> - <!-- Not required for NLP applications --> - <exclusion> - <groupId>org.datavec</groupId> - <artifactId>datavec-data-image</artifactId> - </exclusion> - </exclusions> - </dependency> - <dependency> - <groupId>org.deeplearning4j</groupId> - <artifactId>deeplearning4j-ui</artifactId> - <version>${dl4j.version}</version> - </dependency> - <dependency> - <groupId>org.deeplearning4j</groupId> - <artifactId>deeplearning4j-nlp</artifactId> - <version>${dl4j.version}</version> - </dependency> + <profiles> + <profile> + <id>platform-win-x64</id> + <activation> + <os> + <family>Windows</family> + <arch>x64</arch> + </os> + </activation> + <dependencies> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacpp</artifactId> + <version>${javacpp.version}</version> + <classifier>windows-x86_64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>openblas</artifactId> + <version>${openblas.version}</version> + <classifier>windows-x86_64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>hdf5</artifactId> + <version>${hdf5.version}</version> + <classifier>windows-x86_64</classifier> + <scope>runtime</scope> + </dependency> + </dependencies> + </profile> + <profile> + <id>platform-win-x86</id> + <activation> + <os> + <family>Windows</family> + <arch>x86</arch> + </os> + </activation> + <dependencies> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacpp</artifactId> + <version>${javacpp.version}</version> + <classifier>windows-x86</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>openblas</artifactId> + <version>${openblas.version}</version> + <classifier>windows-x86</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>hdf5</artifactId> + <version>${hdf5.version}</version> + <classifier>windows-x86</classifier> + <scope>runtime</scope> + </dependency> + </dependencies> + </profile> + <profile> + <id>platform-linux-x64</id> + <activation> + <os> + <family>unix</family> + <name>Linux</name> + <arch>amd64</arch> + </os> + </activation> + <dependencies> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacpp</artifactId> + <version>${javacpp.version}</version> + <classifier>linux-x86_64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>openblas</artifactId> + <version>${openblas.version}</version> + <classifier>linux-x86_64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>hdf5</artifactId> + <version>${hdf5.version}</version> + <classifier>linux-x86_64</classifier> + <scope>runtime</scope> + </dependency> + </dependencies> + </profile> + <profile> + <id>platform-macosx-x64</id> + <activation> + <os> + <family>Mac</family> + <arch>x64</arch> + </os> + </activation> + <dependencies> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacpp</artifactId> + <version>${javacpp.version}</version> + <classifier>macosx-x86_64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>openblas</artifactId> + <version>${openblas.version}</version> + <classifier>macosx-x86_64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>hdf5</artifactId> + <version>${hdf5.version}</version> + <classifier>macosx-x86_64</classifier> + <scope>runtime</scope> + </dependency> + </dependencies> + </profile> + <profile> + <id>platform-macosx-aarch64</id> + <activation> + <os> + <family>mac</family> + <arch>aarch64</arch> + </os> + </activation> + <dependencies> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>javacpp</artifactId> + <version>${javacpp.version}</version> + <classifier>macosx-arm64</classifier> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>openblas</artifactId> + <version>${openblas.version}</version> + <classifier>macosx-arm64</classifier> + <scope>runtime</scope> + </dependency> + <!-- Not available for this platform, yet...--> + <!-- + <dependency> + <groupId>org.bytedeco</groupId> + <artifactId>hdf5</artifactId> + <version>${hdf5.version}</version> + <classifier>macosx-arm64</classifier> + <scope>runtime</scope> + </dependency> + --> + </dependencies> + </profile> + </profiles> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>openblas</artifactId> - <version>${openblas.version}</version> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>${javacpp.version}</version> - </dependency> - </dependencies> + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-compiler-plugin</artifactId> + <configuration> + <source>${maven.compiler.source}</source> + <target>${maven.compiler.target}</target> + <compilerArgument>-Xlint</compilerArgument> + </configuration> + </plugin> - <profiles> - <profile> - <id>platform-win-x64</id> - <activation> - <os> - <family>Windows</family> - <arch>x64</arch> - </os> - </activation> - <dependencies> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>${javacpp.version}</version> - <classifier>windows-x86_64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>openblas</artifactId> - <version>${openblas.version}</version> - <classifier>windows-x86_64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>hdf5</artifactId> - <version>${hdf5.version}</version> - <classifier>windows-x86_64</classifier> - <scope>runtime</scope> - </dependency> - </dependencies> - </profile> - <profile> - <id>platform-win-x86</id> - <activation> - <os> - <family>Windows</family> - <arch>x86</arch> - </os> - </activation> - <dependencies> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>${javacpp.version}</version> - <classifier>windows-x86</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>openblas</artifactId> - <version>${openblas.version}</version> - <classifier>windows-x86</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>hdf5</artifactId> - <version>${hdf5.version}</version> - <classifier>windows-x86</classifier> - <scope>runtime</scope> - </dependency> - </dependencies> - </profile> - <profile> - <id>platform-linux-x64</id> - <activation> - <os> - <family>unix</family> - <name>Linux</name> - <arch>amd64</arch> - </os> - </activation> - <dependencies> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>${javacpp.version}</version> - <classifier>linux-x86_64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>openblas</artifactId> - <version>${openblas.version}</version> - <classifier>linux-x86_64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>hdf5</artifactId> - <version>${hdf5.version}</version> - <classifier>linux-x86_64</classifier> - <scope>runtime</scope> - </dependency> - </dependencies> - </profile> - <profile> - <id>platform-macosx-x64</id> - <activation> - <os> - <family>Mac</family> - <arch>x64</arch> - </os> - </activation> - <dependencies> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>${javacpp.version}</version> - <classifier>macosx-x86_64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>openblas</artifactId> - <version>${openblas.version}</version> - <classifier>macosx-x86_64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>hdf5</artifactId> - <version>${hdf5.version}</version> - <classifier>macosx-x86_64</classifier> - <scope>runtime</scope> - </dependency> - </dependencies> - </profile> - <profile> - <id>platform-macosx-aarch64</id> - <activation> - <os> - <family>mac</family> - <arch>aarch64</arch> - </os> - </activation> - <dependencies> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>javacpp</artifactId> - <version>${javacpp.version}</version> - <classifier>macosx-arm64</classifier> - <scope>runtime</scope> - </dependency> - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>openblas</artifactId> - <version>${openblas.version}</version> - <classifier>macosx-arm64</classifier> - <scope>runtime</scope> - </dependency> - <!-- Not available for this platform, yet...--> - <!-- - <dependency> - <groupId>org.bytedeco</groupId> - <artifactId>hdf5</artifactId> - <version>${hdf5.version}</version> - <classifier>macosx-arm64</classifier> - <scope>runtime</scope> - </dependency> - --> - </dependencies> - </profile> - </profiles> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-surefire-plugin</artifactId> + <configuration> + <argLine>-Xmx2048m -Dfile.encoding=UTF-8</argLine> + <forkCount>${opennlp.forkCount}</forkCount> + <reuseForks>false</reuseForks> + <failIfNoSpecifiedTests>false</failIfNoSpecifiedTests> + <excludes> + <exclude>**/*IT.java</exclude> + </excludes> + </configuration> + </plugin> - <build> - <plugins> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <configuration> - <source>${maven.compiler.source}</source> - <target>${maven.compiler.target}</target> - <compilerArgument>-Xlint</compilerArgument> - </configuration> - </plugin> + <plugin> + <artifactId>maven-source-plugin</artifactId> + <executions> + <execution> + <id>create-source-jar</id> + <goals> + <goal>jar</goal> + </goals> + <phase>package</phase> + </execution> + </executions> + </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-surefire-plugin</artifactId> - <configuration> - <argLine>-Xmx2048m -Dfile.encoding=UTF-8</argLine> - <forkCount>${opennlp.forkCount}</forkCount> - <reuseForks>false</reuseForks> - <failIfNoSpecifiedTests>false</failIfNoSpecifiedTests> - <excludes> - <exclude>**/*IT.java</exclude> - </excludes> - </configuration> - </plugin> + <plugin> + <artifactId>maven-antrun-plugin</artifactId> + <executions> + <execution> + <id>generate checksums for binary artifacts</id> + <goals> + <goal>run</goal> + </goals> + <phase>verify</phase> + <configuration> + <target> + <checksum algorithm="sha1" format="MD5SUM"> + <fileset dir="${project.build.directory}"> + <include name="*.zip" /> + <include name="*.gz" /> + </fileset> + </checksum> + <checksum algorithm="md5" format="MD5SUM"> + <fileset dir="${project.build.directory}"> + <include name="*.zip" /> + <include name="*.gz" /> + </fileset> + </checksum> + </target> + </configuration> + </execution> + </executions> + </plugin> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <executions> + <execution> + <id>src</id> + <goals> + <goal>single</goal> + </goals> + <phase>package</phase> + <configuration> + <descriptors> + <descriptor>src/main/assembly/assembly.xml</descriptor> + </descriptors> + </configuration> + </execution> + <execution> + <id>source-release-assembly</id> + <configuration> + <skipAssembly>true</skipAssembly> + <mavenExecutorId>forked-path</mavenExecutorId> + </configuration> + </execution> + </executions> + </plugin> - <plugin> - <artifactId>maven-source-plugin</artifactId> - <executions> - <execution> - <id>create-source-jar</id> - <goals> - <goal>jar</goal> - </goals> - <phase>package</phase> - </execution> - </executions> - </plugin> - - <plugin> - <artifactId>maven-antrun-plugin</artifactId> - <executions> - <execution> - <id>generate checksums for binary artifacts</id> - <goals> - <goal>run</goal> - </goals> - <phase>verify</phase> - <configuration> - <target> - <checksum algorithm="sha1" format="MD5SUM"> - <fileset dir="${project.build.directory}"> - <include name="*.zip" /> - <include name="*.gz" /> - </fileset> - </checksum> - <checksum algorithm="md5" format="MD5SUM"> - <fileset dir="${project.build.directory}"> - <include name="*.zip" /> - <include name="*.gz" /> - </fileset> - </checksum> - </target> - </configuration> - </execution> - </executions> - </plugin> - <plugin> - <artifactId>maven-assembly-plugin</artifactId> - <executions> - <execution> - <id>src</id> - <goals> - <goal>single</goal> - </goals> - <phase>package</phase> - <configuration> - <descriptors> - <descriptor>src/main/assembly/assembly.xml</descriptor> - </descriptors> - </configuration> - </execution> - <execution> - <id>source-release-assembly</id> - <configuration> - <skipAssembly>true</skipAssembly> - <mavenExecutorId>forked-path</mavenExecutorId> - </configuration> - </execution> - </executions> - </plugin> - - <plugin> - <groupId>org.sonatype.plugins</groupId> - <artifactId>nexus-staging-maven-plugin</artifactId> - <version>1.7.0</version> - <extensions>true</extensions> - <configuration> - <serverId>ossrh</serverId> - <nexusUrl>https://oss.sonatype.org/</nexusUrl> - <autoReleaseAfterClose>true</autoReleaseAfterClose> - </configuration> - </plugin> - </plugins> - </build> + <plugin> + <groupId>org.sonatype.plugins</groupId> + <artifactId>nexus-staging-maven-plugin</artifactId> + <version>1.7.0</version> + <extensions>true</extensions> + <configuration> + <serverId>ossrh</serverId> + <nexusUrl>https://oss.sonatype.org/</nexusUrl> + <autoReleaseAfterClose>true</autoReleaseAfterClose> + </configuration> + </plugin> + </plugins> + </build> </project> \ No newline at end of file diff --git a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java index 2db4f12..8f08443 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/apps/review_builder/MachineTranslationWrapper.java @@ -36,6 +36,7 @@ public class MachineTranslationWrapper { public String translate(String sentence, String lang2lang) { if (sentence==null) return null; + String request = TRANSLATOR_URL + sentence.replace(' ','+') + "&langpair="+lang2lang;//"en|es"; try { URL urlC = new URI(request).toURL(); @@ -43,17 +44,18 @@ public class MachineTranslationWrapper { String line; StringBuilder result = new StringBuilder(); - BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream())); - int count = 0; - while ((line = reader.readLine()) != null) - { - result.append(line); - count++; + try (BufferedReader reader = new BufferedReader(new InputStreamReader(connection.getInputStream()))) { + int count = 0; + while ((line = reader.readLine()) != null) + { + result.append(line); + count++; + } + JSONObject rootObject = new JSONObject(result.toString()); + JSONObject findObject = rootObject.getJSONObject("responseData"); + String transl = findObject.getString("translatedText"); + return URLDecoder.decode(transl, StandardCharsets.UTF_8); } - JSONObject rootObject = new JSONObject(result.toString()); - JSONObject findObject = rootObject.getJSONObject("responseData"); - String transl = findObject.getString("translatedText"); - return URLDecoder.decode(transl, StandardCharsets.UTF_8); } catch (IOException | URISyntaxException | JSONException e) { e.printStackTrace(); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java index ccd9f63..41bec16 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifier.java @@ -29,8 +29,6 @@ import opennlp.tools.similarity.apps.utils.ValueSortMap; import opennlp.tools.textsimilarity.TextProcessor; import org.apache.commons.lang.StringUtils; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; @@ -44,30 +42,25 @@ import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.json.JSONObject; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class DocClassifier { - private static final Log LOGGER = LogFactory.getLog(DocClassifier.class); + private static final Logger LOGGER = LoggerFactory.getLogger(DocClassifier.class); public static final String DOC_CLASSIFIER_KEY = "doc_class"; public static final String RESOURCE_DIR = null; private Map<String, Float> scoredClasses; - public static final Float MIN_TOTAL_SCORE_FOR_CATEGORY = 0.3f; //3.0f; protected static IndexReader indexReader = null; protected static IndexSearcher indexSearcher = null; // resource directory plus the index folder - private static final String INDEX_PATH = RESOURCE_DIR - + ClassifierTrainingSetIndexer.INDEX_PATH; + private static final String INDEX_PATH = RESOURCE_DIR + ClassifierTrainingSetIndexer.INDEX_PATH; // http://en.wikipedia.org/wiki/K-nearest_neighbors_algorithm private static final int MAX_DOCS_TO_USE_FOR_CLASSIFY = 10, // 10 similar - // docs for - // nearest - // neighbor - // settings - + // docs for nearest neighbor settings MAX_CATEG_RESULTS = 2; private static final float BEST_TO_NEX_BEST_RATIO = 2.0f; // to accumulate classif results @@ -112,7 +105,7 @@ public class DocClassifier { } } - public DocClassifier(String inputFilename, JSONObject inputJSON) { + public DocClassifier(String inputFilename) { scoredClasses = new HashMap<>(); } @@ -131,18 +124,15 @@ public class DocClassifier { Query query; try { query = parser.parse(queryStr); - } catch (ParseException e2) { - return results; } TopDocs hits = null; // TopDocs search(Query, int) // Finds the top n hits for query. try { - hits = indexSearcher - .search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2); + hits = indexSearcher.search(query, MAX_DOCS_TO_USE_FOR_CLASSIFY + 2); } catch (IOException e1) { - LOGGER.error("problem searching index \n" + e1); + LOGGER.error("problem searching index \n", e1); } LOGGER.debug("Found " + hits.totalHits + " hits for " + queryStr); int count = 0; @@ -175,8 +165,7 @@ public class DocClassifier { } try { scoredClasses = ValueSortMap.sortMapByValue(scoredClasses, false); - List<String> resultsAll = new ArrayList<>( - scoredClasses.keySet()), resultsAboveThresh = new ArrayList<>(); + List<String> resultsAll = new ArrayList<>(scoredClasses.keySet()), resultsAboveThresh = new ArrayList<>(); for (String key : resultsAll) { if (scoredClasses.get(key) > MIN_TOTAL_SCORE_FOR_CATEGORY) resultsAboveThresh.add(key); @@ -211,15 +200,11 @@ public class DocClassifier { } - - - public static String formClassifQuery(String pageContentReader, int maxRes) { // We want to control which delimiters we substitute. For example '_' & // \n we retain - pageContentReader = pageContentReader.replaceAll("[^A-Za-z0-9 _\\n]", - ""); + pageContentReader = pageContentReader.replaceAll("[^A-Za-z0-9 _\\n]", ""); Scanner in = new Scanner(pageContentReader); in.useDelimiter("\\s+"); @@ -258,11 +243,9 @@ public class DocClassifier { } } - /* * Main entry point for classifying sentences */ - public List<String> getEntityOrClassFromText(String content) { List<String> sentences = TextProcessor.splitToSentences(content); @@ -284,7 +267,6 @@ public class DocClassifier { LOGGER.debug(sentence + " => " + classifResults); } } - } catch (Exception e) { LOGGER.error("Problem classifying sentence\n " + e); } @@ -294,11 +276,10 @@ public class DocClassifier { aggrResults = localCats.getFrequentTags(); - LOGGER.debug(localCats.getFrequentTags()); + LOGGER.debug(localCats.getFrequentTags().toString()); } catch (Exception e) { - LOGGER.error("Problem aggregating search results\n" + e); + LOGGER.error("Problem aggregating search results\n", e); } return aggrResults; } - } diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java index 90501ad..29a5107 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetMultilingualExtender.java @@ -33,7 +33,6 @@ import java.util.List; import org.apache.commons.io.FileUtils; import org.apache.commons.lang.StringUtils; -import org.json.JSONObject; /* * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. @@ -56,7 +55,7 @@ public class DocClassifierTrainingSetMultilingualExtender { public DocClassifierTrainingSetMultilingualExtender(String resource) { - classifier = new DocClassifier("", new JSONObject()); + classifier = new DocClassifier(""); } private final int FRAGMENT_LENGTH = 500; diff --git a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java index 4da160a..95c2b27 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/doc_classifier/DocClassifierTrainingSetVerifier.java @@ -26,33 +26,28 @@ import opennlp.tools.jsmlearning.ProfileReaderWriter; import org.apache.commons.io.FileUtils; import org.apache.tika.Tika; import org.apache.tika.exception.TikaException; -import org.json.JSONObject; /* * This utility gets 'training_corpus' as input and creates a new version of training_corpus with verified files. * Verified => classified by existing training set as only belonging to its target category, no other categories, not empty. */ public class DocClassifierTrainingSetVerifier { + + private static final int FRAGMENT_LENGTH = 500; public static String projectHome = new File(".").getAbsolutePath(); - public static String resourceDir = new File(".").getAbsolutePath().replace("/.", "") + "/src/main/resources"; + public static String resourceDir = projectHome.replace("/.", "") + "/src/main/resources"; DocClassifier classifier; private String sourceDir = null, destinationDir = null; - protected final ArrayList<File> queue = new ArrayList<>(); - protected final Tika tika = new Tika(); - public DocClassifierTrainingSetVerifier(String resource) { - - - classifier = new DocClassifier("", new JSONObject()); + public DocClassifierTrainingSetVerifier(String resource) { + classifier = new DocClassifier(""); } - private static final int FRAGMENT_LENGTH = 500; protected void addFiles(File file) { - try { if (!file.exists()) { System.out.println(file + " does not exist."); @@ -90,8 +85,7 @@ public class DocClassifierTrainingSetVerifier { //if (f.getName().indexOf(".html")<0) //continue; - classifier = new DocClassifier("", new JSONObject()); - + classifier = new DocClassifier(""); content = tika.parseToString(f); diff --git a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java index 409172b..8224273 100644 --- a/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java +++ b/opennlp-similarity/src/main/java/opennlp/tools/textsimilarity/ParseTreeChunk.java @@ -421,11 +421,10 @@ public class ParseTreeChunk implements Serializable { } public boolean equals(ParseTreeChunk ch) { - List<String> lems = ch.getLemmas(); - List<String> poss = ch.POSs; return ListUtils.isEqualList(ch.getLemmas(), this.lemmas) && ListUtils.isEqualList(ch.getPOSs(), this.POSs); } + @Override public String toString() { StringBuilder buf = new StringBuilder(" ["); if (mainPOS != null)
