This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch opennlp-2.x
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/opennlp-2.x by this push:
new 117d27f7 OPENNLP-1782: Add tagging examples to verify French POS model
(#863)
117d27f7 is described below
commit 117d27f7c453780a109ee832d45eb953e72e6f1a
Author: meriam2303 <[email protected]>
AuthorDate: Tue Nov 11 14:49:32 2025 +0100
OPENNLP-1782: Add tagging examples to verify French POS model (#863)
* adds French sample sentence and POS tags, incl. Arabic+Maghrebi stub
examples for existing tests
* adds French constant
* inits French resources for test context
---------
Co-authored-by: Richard Zowalla <[email protected]>
(cherry picked from commit 237a7713cc6d59e914741f58395fa4068ef311e6)
---
.../java/opennlp/tools/postag/POSTaggerMEIT.java | 21 +++++++++++++++++++--
1 file changed, 19 insertions(+), 2 deletions(-)
diff --git
a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
index d901654a..931e5c6f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMEIT.java
@@ -43,6 +43,7 @@ public class POSTaggerMEIT {
private static final String GERMAN = "de";
private static final String POLISH = "pl";
private static final String PORTUGUESE = "pt";
+ private static final String FRENCH = "fr";
private static final Map<String, Tokenizer> TOKENIZERS = new HashMap<>();
private static final Map<String, POSTagger> TAGGERS = new HashMap<>();
@@ -51,7 +52,7 @@ public class POSTaggerMEIT {
@BeforeAll
public static void initResources() throws IOException {
- List<String> langs = List.of(CATALAN, ENGLISH, GERMAN, POLISH, PORTUGUESE);
+ final List<String> langs = List.of(CATALAN, ENGLISH, FRENCH, GERMAN,
POLISH, PORTUGUESE);
for (String langCode: langs) {
TOKENIZERS.put(langCode, new ThreadSafeTokenizerME(langCode));
TAGGERS.put(langCode, new ThreadSafePOSTaggerME(langCode));
@@ -142,7 +143,7 @@ public class POSTaggerMEIT {
"Un gran embossament d'aire fred es comença a despenjar cap al centre
d'Europa.",
// OpenNLP, different at: idx pos 2, 3, 5, and 13(+14) -> however,
only pos 5 is "wrong" (ref)
new String[]{"DET", "ADJ", "NOUN", "ADP", "NOUN", "ADJ", "PRON",
"VERB", "ADP", "VERB", "NOUN",
- "ADP+DET", "NOUN", "ADP", "PROPN", "PUNCT"})
+ "ADP+DET", "NOUN", "ADP", "PROPN", "PUNCT"}),
// REFERENCE ("gold"):
// "DET", "ADJ", "NOUN", "ADP", "NOUN", "ADJ", "PRON", "VERB", "ADP",
"VERB", "NOUN", "ADP+DET",
// "NOUN", "ADP", "PROPN", "PUNCT"})
@@ -152,6 +153,22 @@ public class POSTaggerMEIT {
// "NOUN", "PROPN", "PROPN", "PUNCT"
// ok! , ok! , ??? , ??? , ok! , ok! , ok! , ok! , ok! ,
ok! , ok! , ok! + ok! ,
// ok! , ??? , ok! , ok!
+ // via @meriam2303, original by Guillaume Musso:
+ // La jeune fille et la nuit, S.469
+ Arguments.of(FRENCH, 0,
+ "Vivre avec elle me faisait souffrir, mais vivre sans elle
m'aurait tué.",
+ new String[] {"VERB", "ADP", "PRON", "PRON", "VERB", "VERB",
"PUNCT", "CCONJ", "VERB",
+ "ADP", "PRON", "PRON", "AUX", "VERB", "PUNCT"})
+ // via @meriam2303, original by Hind Choueykh Ben Salah
+ // التجريد في الشّعر العربي , S. 42
+ //Arguments.of(ARABIC,0,
+ //"عشق أبو نواس جارية تدعى جنان",
+ //new String[]{"VERB","PROPN","NOUN","VERB","PROPN"})
+ // via @meriam2303, original by Mohamed Laarousi Elmetoui
+ // التوت المر , S.7
+ //Arguments.of(MAGHREBI_ARABIC_FRENCH,0,
+ //"Wassa3 belek ya baba...",
+ //new String[]{"VERB","NOUN","INTJ","NOUN","PUNCT"})
);
}
}