This is an automated email from the ASF dual-hosted git repository. rzo1 pushed a commit to branch OPENNLP-1638 in repository https://gitbox.apache.org/repos/asf/opennlp-models.git
commit 666e3627e2ec887e50ae161e0ff17f13c54bddd8 Author: Richard Zowalla <[email protected]> AuthorDate: Wed Nov 6 15:03:00 2024 +0100 OPENNLP-1638 - Add initial training script --- .../opennlp-models-training-ud/pom.xml | 35 +++++++ .../src/main/resources/ud-train.sh | 115 +++++++++++++++++++++ opennlp-models-training/pom.xml | 41 ++++++++ pom.xml | 1 + 4 files changed, 192 insertions(+) diff --git a/opennlp-models-training/opennlp-models-training-ud/pom.xml b/opennlp-models-training/opennlp-models-training-ud/pom.xml new file mode 100644 index 0000000..949cab6 --- /dev/null +++ b/opennlp-models-training/opennlp-models-training-ud/pom.xml @@ -0,0 +1,35 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models-training</artifactId> + <version>1.1.1-SNAPSHOT</version> + </parent> + + <artifactId>opennlp-models-training-ud</artifactId> + <name>Apache OpenNLP Models :: Training :: Universal Dependencies</name> + +</project> \ No newline at end of file diff --git a/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh new file mode 100755 index 0000000..3d3c8e7 --- /dev/null +++ b/opennlp-models-training/opennlp-models-training-ud/src/main/resources/ud-train.sh @@ -0,0 +1,115 @@ +#!/bin/bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +# This script facilitates training OpenNLP models on Universal Dependencies (UD) 2.14 data. 
+ +# Script configuration +UD_HOME="./" +OPENNLP_VERSION="opennlp-2.4.0" +OPENNLP_VERSION_NUMERIC="2.4.0" +OPENNLP_MODEL_VERSION="1.1" +OPENNLP_HOME="./apache-opennlp-2.4.0" +OUTPUT_MODELS="./ud-models-2.4.0" +GPG_PUBLIC_KEY="" # the public key from the OPENNLP KEYS file in short form. +EVAL_AFTER_TRAINING="true" +CREATE_RELEASE="true" +ENCODING="UTF-8" + + +# Model(s) to train +declare -a MODELS=("English|en|EWT" "Dutch|nl|Alpino" "French|fr|GSD" "German|de|GSD" "Italian|it|VIT" "Bulgarian|bg|BTB" "Czech|cs|PDT" "Croatian|hr|SET" "Danish|da|DDT" "Estonian|et|EDT" "Finnish|fi|TDT" "Latvian|lv|LVTB" "Norwegian|no|Bokmaal" "Polish|pl|PDB" "Portuguese|pt|GSD" "Romanian|ro|RRT" "Russian|ru|GSD" "Serbian|sr|SET" "Slovenian|sl|SSJ" "Spanish|es|GSD" "Slovak|sk|SNK" "Swedish|sv|Talbanken" "Ukrainian|uk|IU") + +# Create output directory +mkdir -p ${OUTPUT_MODELS} + +for i in "${MODELS[@]}" +do + + echo $i + LANG=`echo $i | cut -d'|' -f1` + LANGCODE=`echo $i | cut -d'|' -f2` + SUBSET=`echo $i | cut -d'|' -f3` + SUBSETLC=`echo ${SUBSET} | tr '[:upper:]' '[:lower:]'` + + # Tokenizer model + echo -e "\nTraining tokenizer model ${SUBSET} ${LANG}..." + ${OPENNLP_HOME}/bin/opennlp TokenizerTrainer.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -lang ${LANGCODE} -data ./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu -encoding ${ENCODING} > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train + + if [ ${EVAL_AFTER_TRAINING} == "true" ]; then + echo -e "\nEvaluating tokenizer model ${SUBSET} ${LANG}..." 
+ ${OPENNLP_HOME}/bin/opennlp TokenizerMEEvaluator.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -data ./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu -encoding ${ENCODING} > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval + fi + + if [ ${CREATE_RELEASE} == "true" ]; then + echo -e "\nCreating hashes and ASC signature for tokenizer model ${SUBSET} ${LANG}..." + sha512sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512 + sha256sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256 + gpg --default-key $GPG_PUBLIC_KEY --armor --output ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc --detach-sign ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-tokens-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin + fi + + # Sentence model + echo -e "\nTraining sentence model ${SUBSET} ${LANG}..." + ${OPENNLP_HOME}/bin/opennlp SentenceDetectorTrainer.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -lang ${LANGCODE} -data ./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu -encoding ${ENCODING} -sentencesPerSample 10 > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.train + + if [ ${EVAL_AFTER_TRAINING} == "true" ]; then + echo -e "Evaluating sentence model ${SUBSET} ${LANG}..." 
+ ${OPENNLP_HOME}/bin/opennlp SentenceDetectorEvaluator.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -data ./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu -encoding ${ENCODING} -sentencesPerSample 10 > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval + fi + + if [ ${CREATE_RELEASE} == "true" ]; then + echo -e "\nCreating hashes and ASC signature for sentence model ${SUBSET} ${LANG}..." + sha512sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512 + sha256sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256 + gpg --default-key $GPG_PUBLIC_KEY --armor --output ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc --detach-sign ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-sentence-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin + fi + + # POS model + echo -e "\nTraining POS model ${SUBSET} ${LANG}..." + ${OPENNLP_HOME}/bin/opennlp POSTaggerTrainer.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -data ./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-train.conllu -encoding ${ENCODING} -lang ${LANGCODE} > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VE [...] 
+ + if [ ${EVAL_AFTER_TRAINING} == "true" ]; then + echo -e "\nEvaluating POS model ${SUBSET} ${LANG}..." + ${OPENNLP_HOME}/bin/opennlp POSTaggerEvaluator.conllu -model ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin -data ./ud-treebanks-v2.14/UD_${LANG}-${SUBSET}/${LANGCODE}_${SUBSETLC}-ud-test.conllu -encoding ${ENCODING} > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.eval + fi + + if [ ${CREATE_RELEASE} == "true" ]; then + echo -e "\nCreating hashes and ASC signature for POS model ${SUBSET} ${LANG}..." + sha512sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha512 + sha256sum ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin > ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.sha256 + gpg --default-key $GPG_PUBLIC_KEY --armor --output ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin.asc --detach-sign ${OUTPUT_MODELS}/opennlp-${LANGCODE}-ud-${SUBSETLC}-pos-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.bin + fi + +done + +if [ ${CREATE_RELEASE} == "true" ]; then + cd ${OUTPUT_MODELS}; + echo -e "\nCreate ZIP with eval and train logs." 
+ zip opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip *train *.eval + sha512sum opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip > opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip.sha512 + sha256sum opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip > opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip.sha256 + gpg --default-key $GPG_PUBLIC_KEY --armor --output opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip.asc --detach-sign opennlp-training-eval-logs-${OPENNLP_MODEL_VERSION}-${OPENNLP_VERSION_NUMERIC}.zip + + echo -e "\nRemove the path from sha512 and sha256 checksum files" + # Remove the path from sha512 and sha256 checksum files + sed -i "" "s|${OUTPUT_MODELS}/||" *.sha512 + sed -i "" "s|${OUTPUT_MODELS}/||" *.sha256 + +fi \ No newline at end of file diff --git a/opennlp-models-training/pom.xml b/opennlp-models-training/pom.xml new file mode 100644 index 0000000..700e050 --- /dev/null +++ b/opennlp-models-training/pom.xml @@ -0,0 +1,41 @@ +<?xml version="1.0" encoding="UTF-8"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.opennlp</groupId> + <artifactId>opennlp-models</artifactId> + <version>1.1.1-SNAPSHOT</version> + </parent> + + <packaging>pom</packaging> + + <artifactId>opennlp-models-training</artifactId> + <name>Apache OpenNLP Models :: Training</name> + + <modules> + <module>opennlp-models-training-ud</module> + </modules> + +</project> \ No newline at end of file diff --git a/pom.xml b/pom.xml index 32c1a06..f0d0f89 100644 --- a/pom.xml +++ b/pom.xml @@ -278,6 +278,7 @@ <module>opennlp-models-pos</module> <module>opennlp-models-test</module> <module>opennlp-models-tokenizer</module> + <module>opennlp-models-training</module> </modules> </project>
