This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/master by this push:
new 0c0af3f OPENNLP-1634 - Move OpenNLP Brat Annotator back to Sandbox
0c0af3f is described below
commit 0c0af3f6a5ca29c2c127fbead42c1cdd7a801967
Author: Richard Zowalla <[email protected]>
AuthorDate: Tue Oct 29 13:51:45 2024 +0100
OPENNLP-1634 - Move OpenNLP Brat Annotator back to Sandbox
---
opennlp-brat-annotator/pom.xml | 119 ++++++++++++++++++
.../src/main/bin/brat-annotation-service | 56 +++++++++
.../src/main/bin/brat-annotation-service.bat | 51 ++++++++
.../java/opennlp/bratann/NameFinderAnnService.java | 102 +++++++++++++++
.../java/opennlp/bratann/NameFinderResource.java | 138 +++++++++++++++++++++
pom.xml | 1 +
6 files changed, 467 insertions(+)
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
new file mode 100644
index 0000000..75beb96
--- /dev/null
+++ b/opennlp-brat-annotator/pom.xml
@@ -0,0 +1,119 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more
contributor
+ license agreements. See the NOTICE file distributed with this work for
additional
+ information regarding copyright ownership. The ASF licenses this file to
+ you under the Apache License, Version 2.0 (the "License"); you may not
use
+ this file except in compliance with the License. You may obtain a copy
of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless
required
+ by applicable law or agreed to in writing, software distributed under
the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0
http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-sandbox</artifactId>
+ <version>2.4.1-SNAPSHOT</version>
+ </parent>
+
+ <artifactId>opennlp-brat-annotator</artifactId>
+ <packaging>jar</packaging>
+
+ <name>Apache OpenNLP Brat Annotator</name>
+
+ <properties>
+
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ <jackson.version>2.18.0</jackson.version>
+ <jersey.version>3.1.9</jersey.version>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.glassfish.jersey.containers</groupId>
+ <artifactId>jersey-container-grizzly2-http</artifactId>
+ <version>${jersey.version}</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.glassfish.jersey.media</groupId>
+ <artifactId>jersey-media-json-jackson</artifactId>
+ <version>${jersey.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-annotations</artifactId>
+ <version>${jackson.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.core</groupId>
+ <artifactId>jackson-databind</artifactId>
+ <version>${jackson.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>com.fasterxml.jackson.module</groupId>
+ <artifactId>jackson-module-jaxb-annotations</artifactId>
+ <version>${jackson.version}</version>
+ <scope>runtime</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.opennlp</groupId>
+ <artifactId>opennlp-tools</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-api</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.junit.jupiter</groupId>
+ <artifactId>junit-jupiter-engine</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-simple</artifactId>
+ <version>${slf4j.version}</version>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <artifactId>maven-assembly-plugin</artifactId>
+ <configuration>
+ <descriptorRefs>
+
<descriptorRef>jar-with-dependencies</descriptorRef>
+ </descriptorRefs>
+ </configuration>
+ <executions>
+ <execution>
+ <id>make-assembly</id>
+ <phase>package</phase>
+ <goals>
+ <goal>single</goal>
+ </goals>
+ </execution>
+ </executions>
+ </plugin>
+ </plugins>
+ </build>
+</project>
diff --git a/opennlp-brat-annotator/src/main/bin/brat-annotation-service
b/opennlp-brat-annotator/src/main/bin/brat-annotation-service
new file mode 100755
index 0000000..eac9566
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/bin/brat-annotation-service
@@ -0,0 +1,56 @@
+#!/bin/sh
+
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+# Note: Do not output anything in this script file, any output
+# may be inadvertently placed in any output files if
+# output redirection is used.
+
+# determine OPENNLP_HOME - $0 may be a symlink to OpenNLP's home
+PRG="$0"
+
+# Follow symlinks until PRG names the real script file.
+while [ -h "$PRG" ] ; do
+  ls=`ls -ld "$PRG"`
+  link=`expr "$ls" : '.*-> \(.*\)$'`
+  if expr "$link" : '/.*' > /dev/null; then
+    # absolute link target
+    PRG="$link"
+  else
+    # relative link target: resolve against the directory of the link
+    PRG="`dirname "$PRG"`/$link"
+  fi
+done
+
+saveddir=`pwd`
+
+OPENNLP_HOME=`dirname "$PRG"`/..
+
+# make it fully qualified
+OPENNLP_HOME=`cd "$OPENNLP_HOME" && pwd`
+
+cd "$saveddir"
+
+# Pick the Java launcher: an explicit JAVACMD wins, then JAVA_HOME, then PATH.
+if [ -z "$JAVACMD" ] ; then
+  if [ -n "$JAVA_HOME" ] ; then
+    JAVACMD="$JAVA_HOME/bin/java"
+  else
+    JAVACMD="`which java`"
+  fi
+fi
+
+# Let the JVM expand the wildcard itself; building the list with echo/tr
+# breaks as soon as OPENNLP_HOME or a jar name contains a space.
+CLASSPATH="$OPENNLP_HOME/lib/*"
+
+# Quote the launcher and "$@" so paths and arguments containing spaces
+# are passed through unchanged.
+"$JAVACMD" -Xmx1024m -Dlog4j.configurationFile="$OPENNLP_HOME/conf/log4j2.xml" \
+  -cp "$CLASSPATH" opennlp.bratann.NameFinderAnnService "$@"
diff --git a/opennlp-brat-annotator/src/main/bin/brat-annotation-service.bat
b/opennlp-brat-annotator/src/main/bin/brat-annotation-service.bat
new file mode 100755
index 0000000..289248b
--- /dev/null
+++ b/opennlp-brat-annotator/src/main/bin/brat-annotation-service.bat
@@ -0,0 +1,51 @@
+@ECHO off
+
+REM # Licensed to the Apache Software Foundation (ASF) under one
+REM # or more contributor license agreements. See the NOTICE file
+REM # distributed with this work for additional information
+REM # regarding copyright ownership. The ASF licenses this file
+REM # to you under the Apache License, Version 2.0 (the
+REM # "License"); you may not use this file except in compliance
+REM # with the License. You may obtain a copy of the License at
+REM #
+REM # http://www.apache.org/licenses/LICENSE-2.0
+REM #
+REM # Unless required by applicable law or agreed to in writing,
+REM # software distributed under the License is distributed on an
+REM # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+REM # KIND, either express or implied. See the License for the
+REM # specific language governing permissions and limitations
+REM # under the License.
+
+REM # Note: Do not output anything in this script file, any output
+REM # may be inadvertently placed in any output files if
+REM # output redirection is used.
+SETLOCAL
+
+REM # Resolve the Java launcher: an already set JAVA_CMD is kept, otherwise
+REM # JAVA_HOME\bin\java is used, and plain "java" if JAVA_HOME is empty too.
+IF "%JAVA_CMD%" == "" (
+ IF "%JAVA_HOME%" == "" (
+ SET JAVA_CMD=java
+ ) ELSE (
+ REM # Keep JAVA_HOME to short-name without spaces
+ FOR %%A IN ("%JAVA_HOME%") DO SET JAVA_CMD=%%~sfA\bin\java
+ )
+)
+
+REM # Resolve OPENNLP_HOME: when it is not set, it is derived from the
+REM # location of this script (%~sp0..).
+REM # Should work with Windows XP and greater. If not, specify the path to
where it is installed.
+IF "%OPENNLP_HOME%" == "" (
+ SET OPENNLP_HOME=%~sp0..
+) ELSE (
+ REM # Keep OPENNLP_HOME to short-name without spaces
+ FOR %%A IN ("%OPENNLP_HOME%") DO SET OPENNLP_HOME=%%~sfA
+)
+REM # Delayed expansion is needed so that !CLASSPATH! reflects the value
+REM # appended in the previous iteration of the FOR loop below.
+setLocal EnableDelayedExpansion
+REM # Build a quoted, ';'-separated class path from every jar in
+REM # %OPENNLP_HOME%\lib.
+REM # NOTE(review): every jar is appended with a leading ';', so the value
+REM # starts with an empty entry - confirm this is intended.
+set CLASSPATH="
+
+FOR %%A IN ("%OPENNLP_HOME%\lib\*.jar") DO (
+ set CLASSPATH=!CLASSPATH!;%%A
+)
+set CLASSPATH=!CLASSPATH!"
+
+REM # Start the annotation service and forward all script arguments (%*).
+%JAVA_CMD% -Xmx1024m
"-Dlog4j.configurationFile=%OPENNLP_HOME%\conf\log4j2.xml" -cp %CLASSPATH%
opennlp.bratann.NameFinderAnnService %*
+
+ENDLOCAL
\ No newline at end of file
diff --git
a/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
new file mode 100644
index 0000000..1735cb8
--- /dev/null
+++
b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderAnnService.java
@@ -0,0 +1,102 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratann;
+
+import java.io.File;
+import java.net.URI;
+import java.util.Arrays;
+import java.util.List;
+
+import jakarta.ws.rs.core.UriBuilder;
+import org.glassfish.jersey.grizzly2.httpserver.GrizzlyHttpServerFactory;
+import org.glassfish.jersey.server.ResourceConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+
+public class NameFinderAnnService {
+
+ private static final Logger LOG =
LoggerFactory.getLogger(NameFinderAnnService.class);
+ static SentenceDetector sentenceDetector = new NewlineSentenceDetector();
+ static Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ static TokenNameFinder[] nameFinders;
+
+ public static void main(String[] args) throws Exception {
+
+ if (args.length == 0) {
+ LOG.info("Usage:");
+ LOG.info("[NameFinderAnnService -serverPort port] [-tokenizerModel file]
"
+ + "[-ruleBasedTokenizer whitespace|simple] "
+ + "[-sentenceDetectorModel file] namefinderFile|nameFinderURI");
+ return;
+ }
+
+ List<String> argList = Arrays.asList(args);
+
+ int serverPort = 8080;
+ int serverPortIndex = argList.indexOf("-serverPort") + 1;
+
+ if (serverPortIndex > 0 && serverPortIndex < args.length) {
+ serverPort = Integer.parseInt(args[serverPortIndex]);
+ }
+
+ int sentenceModelIndex = argList.indexOf("-sentenceDetectorModel") + 1;
+ if (sentenceModelIndex > 0 && sentenceModelIndex < args.length) {
+ sentenceDetector = new SentenceDetectorME(
+ new SentenceModel(new File(args[sentenceModelIndex])));
+ }
+
+ int ruleBasedTokenizerIndex = argList.indexOf("-ruleBasedTokenizer") + 1;
+
+ if (ruleBasedTokenizerIndex > 0 && ruleBasedTokenizerIndex < args.length) {
+ if ("whitespace".equals(args[ruleBasedTokenizerIndex])) {
+ tokenizer = WhitespaceTokenizer.INSTANCE;
+ } else if ("simple".equals(args[ruleBasedTokenizerIndex])) {
+ tokenizer = SimpleTokenizer.INSTANCE;
+ } else {
+ LOG.error("unknown tokenizer: {}", args[ruleBasedTokenizerIndex]);
+ return;
+ }
+ }
+
+ int tokenizerModelIndex = argList.indexOf("-tokenizerModel") + 1;
+ if (tokenizerModelIndex > 0 && tokenizerModelIndex < args.length) {
+ tokenizer = new TokenizerME(
+ new TokenizerModel(new File(args[tokenizerModelIndex])));
+ }
+
+ nameFinders = new TokenNameFinder[] {new NameFinderME(
+ new TokenNameFinderModel(new File(args[args.length - 1])))};
+
+ URI baseUri =
UriBuilder.fromUri("http://localhost/").port(serverPort).build();
+ ResourceConfig config = new ResourceConfig(NameFinderResource.class);
+ GrizzlyHttpServerFactory.createHttpServer(baseUri, config);
+ }
+}
diff --git
a/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
new file mode 100644
index 0000000..f824c18
--- /dev/null
+++
b/opennlp-brat-annotator/src/main/java/opennlp/bratann/NameFinderResource.java
@@ -0,0 +1,138 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.bratann;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import jakarta.ws.rs.Consumes;
+import jakarta.ws.rs.POST;
+import jakarta.ws.rs.Path;
+import jakarta.ws.rs.Produces;
+import jakarta.ws.rs.QueryParam;
+import jakarta.ws.rs.core.MediaType;
+
+import opennlp.tools.namefind.TokenNameFinder;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+/**
+ * JAX-RS resource, mounted at {@code /ner}, that detects names in posted plain text
+ * using the sentence detector, tokenizer and name finders configured in
+ * {@link NameFinderAnnService}.
+ */
+@Path("/ner")
+public class NameFinderResource {
+
+  // Components are read from the static configuration of NameFinderAnnService
+  // when the resource instance is created.
+  private final SentenceDetector sentDetect = NameFinderAnnService.sentenceDetector;
+  private final Tokenizer tokenizer = NameFinderAnnService.tokenizer;
+  private final TokenNameFinder[] nameFinders = NameFinderAnnService.nameFinders;
+
+  /**
+   * Finds the first character in {@code [beginOffset, endOffset)} that is neither
+   * whitespace nor a Unicode space character.
+   *
+   * @param s the text to scan.
+   * @param beginOffset the index to start at (inclusive).
+   * @param endOffset the index to stop at (exclusive).
+   * @return the index of the first such character, or {@code -1} if there is none.
+   */
+  private static int findNextNonWhitespaceChar(CharSequence s, int beginOffset, int endOffset) {
+    for (int i = beginOffset; i < endOffset; i++) {
+      char c = s.charAt(i);
+      // isSpaceChar alone does not cover '\n', '\r' or '\t'; check both so that
+      // line breaks are skipped as well.
+      if (!Character.isWhitespace(c) && !Character.isSpaceChar(c)) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  /** Returns {@code true} for the line break characters {@code '\n'} and {@code '\r'}. */
+  private static boolean isLineBreak(char c) {
+    return c == '\n' || c == '\r';
+  }
+
+  /**
+   * Builds the annotation for one name. The name is split at line breaks so that no
+   * segment spans a line break; whitespace following a line break (including further
+   * line breaks) is skipped before the next segment starts.
+   *
+   * @param text the complete posted text.
+   * @param beginOffset the begin of the name in {@code text} (inclusive).
+   * @param endOffset the end of the name in {@code text} (exclusive).
+   * @param type the type reported by the name finder.
+   * @return the annotation holding the segment texts and their offsets.
+   */
+  private static NameAnn createAnnotation(String text, int beginOffset, int endOffset,
+      String type) {
+    List<String> textSegments = new ArrayList<>();
+    List<int[]> spanSegments = new ArrayList<>();
+
+    int segmentBegin = beginOffset;
+    while (segmentBegin != -1) {
+      // extend the segment up to the next line break or the end of the name
+      int segmentEnd = segmentBegin;
+      while (segmentEnd < endOffset && !isLineBreak(text.charAt(segmentEnd))) {
+        segmentEnd++;
+      }
+
+      textSegments.add(text.substring(segmentBegin, segmentEnd));
+      spanSegments.add(new int[] {segmentBegin, segmentEnd});
+
+      // yields -1 once only whitespace (or nothing) is left before endOffset
+      segmentBegin = findNextNonWhitespaceChar(text, segmentEnd, endOffset);
+    }
+
+    NameAnn ann = new NameAnn();
+    ann.texts = textSegments.toArray(new String[0]);
+    ann.offsets = spanSegments.toArray(new int[spanSegments.size()][]);
+    ann.type = type;
+    return ann;
+  }
+
+  /**
+   * Detects names in the posted text.
+   *
+   * @param modelName the value of the {@code model} query parameter; it is not
+   *                  evaluated by this method.
+   * @param text the plain text request body.
+   * @return the detected names, keyed by a running index starting at {@code "0"}.
+   */
+  @POST
+  @Consumes(MediaType.TEXT_PLAIN)
+  @Produces(MediaType.APPLICATION_JSON)
+  public Map<String, NameAnn> findNames(@QueryParam("model") String modelName, String text) {
+    Span[] sentenceSpans = sentDetect.sentPosDetect(text);
+    Map<String, NameAnn> map = new HashMap<>();
+
+    int indexCounter = 0;
+
+    for (Span sentenceSpan : sentenceSpans) {
+
+      String sentenceText = sentenceSpan.getCoveredText(text).toString();
+
+      // Token spans are relative to the sentence text; the sentence start is
+      // added back below to obtain offsets into the complete text.
+      Span[] tokenSpans = tokenizer.tokenizePos(sentenceText);
+      String[] tokens = Span.spansToStrings(tokenSpans, sentenceText);
+
+      for (TokenNameFinder nameFinder : nameFinders) {
+        for (Span name : nameFinder.find(tokens)) {
+          int beginOffset = tokenSpans[name.getStart()].getStart() + sentenceSpan.getStart();
+          int endOffset = tokenSpans[name.getEnd() - 1].getEnd() + sentenceSpan.getStart();
+
+          map.put(Integer.toString(indexCounter++),
+              createAnnotation(text, beginOffset, endOffset, name.getType()));
+        }
+      }
+    }
+    return map;
+  }
+
+  /**
+   * A detected name. Fields are public so that they are picked up when the
+   * result map is written as JSON.
+   */
+  public static class NameAnn {
+    /** One {@code {begin, end}} pair (end exclusive) into the posted text per segment. */
+    public int[][] offsets;
+    /** The covered text of each segment, parallel to {@link #offsets}. */
+    public String[] texts;
+    /** The name type as reported by the name finder. */
+    public String type;
+  }
+}
diff --git a/pom.xml b/pom.xml
index 6717b3a..18279d0 100644
--- a/pom.xml
+++ b/pom.xml
@@ -99,6 +99,7 @@
<module>mallet-addon</module>
<module>modelbuilder-addon</module>
<module>nlp-utils</module>
+ <module>opennlp-brat-annotator</module>
<module>opennlp-coref</module>
<module>opennlp-dl</module>
<module>opennlp-similarity</module>