Author: rwesten
Date: Thu Dec 5 17:26:01 2013
New Revision: 1548212
URL: http://svn.apache.org/r1548212
Log:
STANBOL-1231: added POS Tag and Phrase Tag mapping for the Treebank+ Tagset in
the opennlp-pos and opennlp-chunker engine.
Modified:
stanbol/trunk/enhancement-engines/opennlp/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java
stanbol/trunk/enhancement-engines/opennlp/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
Modified:
stanbol/trunk/enhancement-engines/opennlp/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/opennlp/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java?rev=1548212&r1=1548211&r2=1548212&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/opennlp/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java
(original)
+++
stanbol/trunk/enhancement-engines/opennlp/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java
Thu Dec 5 17:26:01 2013
@@ -24,6 +24,7 @@ import opennlp.tools.chunker.Chunker;
import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
/**
* Registry for {@link PhraseTag} {@link TagSet}s used by OpenNLP
@@ -75,4 +76,22 @@ public class PhraseTagSetRegistry {
DEFAULT.addTag(new PhraseTag("PP",
LexicalCategory.PronounOrDeterminer));
getInstance().add(DEFAULT);
}
+
+ public static final TagSet<PhraseTag> FRENCH = new TagSet<PhraseTag>(
+ "French Treebank+ Phrase TagSet", "fr");
+
+ static {
+ FRENCH.addTag(new PhraseTag("AP", LexicalCategory.Adjective));
+ FRENCH.addTag(new PhraseTag("AdP",LexicalCategory.Adverb));
+ FRENCH.addTag(new PhraseTag("COORD",LexicalCategory.Conjuction));
+ FRENCH.addTag(new PhraseTag("NP",LexicalCategory.Noun));
+ FRENCH.addTag(new PhraseTag("PP",
LexicalCategory.PronounOrDeterminer));
+ FRENCH.addTag(new PhraseTag("VN",LexicalCategory.Verb));
+ FRENCH.addTag(new PhraseTag("VPinf",LexicalCategory.Verb));
+ FRENCH.addTag(new PhraseTag("VPpart",LexicalCategory.Verb));
+ FRENCH.addTag(new PhraseTag("Ssub"));
+ FRENCH.addTag(new PhraseTag("Srel"));
+ FRENCH.addTag(new PhraseTag("Sint"));
+ getInstance().add(FRENCH);
+ }
}
Modified:
stanbol/trunk/enhancement-engines/opennlp/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
URL:
http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/opennlp/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java?rev=1548212&r1=1548211&r2=1548212&view=diff
==============================================================================
---
stanbol/trunk/enhancement-engines/opennlp/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
(original)
+++
stanbol/trunk/enhancement-engines/opennlp/opennlp-pos/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/pos/model/PosTagSetRegistry.java
Thu Dec 5 17:26:01 2013
@@ -228,11 +228,11 @@ public final class PosTagSetRegistry {
SWEDISH.addTag(new PosTag("HV",LexicalCategory.Verb)); //"ha(va)"
(have)
SWEDISH.addTag(new PosTag("WV",LexicalCategory.Verb)); //"vilja" (want)
SWEDISH.addTag(new PosTag("QV",LexicalCategory.Verb)); //"kunna" (can)
- SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb)); //"måste"
(must)
+ SWEDISH.addTag(new PosTag("MV",LexicalCategory.Verb)); //"måste"
(must)
SWEDISH.addTag(new PosTag("KV",LexicalCategory.Verb)); // locution
"komma att" (periphrastic future)
SWEDISH.addTag(new PosTag("SV",LexicalCategory.Verb)); //"skola"
(will, shall)
- SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb)); //"göra" (do,
make)
- SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb)); //få" (get)
+ SWEDISH.addTag(new PosTag("GV",LexicalCategory.Verb)); //"göra"
(do, make)
+ SWEDISH.addTag(new PosTag("FV",LexicalCategory.Verb)); //få" (get)
SWEDISH.addTag(new PosTag("VV",LexicalCategory.Verb)); //all other
verbs
SWEDISH.addTag(new PosTag("TP",Pos.PastParticiple)); //PerfectParticle
SWEDISH.addTag(new PosTag("SP",Pos.PresentParticiple));
@@ -257,5 +257,56 @@ public final class PosTagSetRegistry {
SWEDISH.addTag(new PosTag("XX"));
getInstance().add(SWEDISH);
}
-
+ /**
+ * POS tags used by the French Treebank as described in
+ * <a
href="http://alpage.inria.fr/statgram/frdep/Publications/crabbecandi-taln2008-final.pdf">
+ * Expériences d'analyse syntaxique statistique du français</a> page
8.<p>
+ * Note that this Tagset was originally introduced by Crabbé & Candito,
2008
+ * but the linked paper contains a nice tabular overview of it.
+ */
+ public static final TagSet<PosTag> FRENCH = new TagSet<PosTag>("Treebank+
French","fr");
+
+ static {
+ //Cat C
+ FRENCH.addTag(new PosTag("CS",Pos.SubordinatingConjunction));
+ FRENCH.addTag(new PosTag("CC",Pos.CoordinatingConjunction));
+ //Cat CL
+ FRENCH.addTag(new PosTag("CLO", Pos.PersonalPronoun)); //Clitic
+ FRENCH.addTag(new PosTag("CLS", Pos.PersonalPronoun)); //Clitic
+ FRENCH.addTag(new PosTag("CLR", Pos.PersonalPronoun)); //Clitic
+ //Cat P
+ FRENCH.addTag(new PosTag("P",Pos.Preposition));
+ FRENCH.addTag(new PosTag("P+D")); //no cat
+ FRENCH.addTag(new PosTag("P+PRO")); //no cat
+ //Cat I
+ FRENCH.addTag(new PosTag("I", LexicalCategory.Interjection)); //no cat
+ //Cat PONCT
+ FRENCH.addTag(new PosTag("PONCT",LexicalCategory.Punctuation));
+ //Cat ET
+ FRENCH.addTag(new PosTag("ET", Pos.Foreign));
+ //Cat A
+ FRENCH.addTag(new PosTag("ADJ",LexicalCategory.Adjective));
+ FRENCH.addTag(new PosTag("ADJWH",LexicalCategory.Adjective));
+ //Cat ADV
+ FRENCH.addTag(new PosTag("ADV",LexicalCategory.Adverb));
+ FRENCH.addTag(new PosTag("ADVWH",LexicalCategory.Adverb));
+ //Cat PRO
+ FRENCH.addTag(new PosTag("PRO",Pos.StrongPersonalPronoun)); //Strong
Pronoun
+ FRENCH.addTag(new PosTag("PROWH",Pos.StrongPersonalPronoun)); //Strong
Pronoun
+ FRENCH.addTag(new PosTag("PROREL",Pos.StrongPersonalPronoun));
//Strong Pronoun
+ //Cat D
+ FRENCH.addTag(new PosTag("DET",Pos.Determiner));
+ FRENCH.addTag(new PosTag("DETWH",Pos.Determiner));
+ //Cat N
+ FRENCH.addTag(new PosTag("NC", Pos.CommonNoun));
+ FRENCH.addTag(new PosTag("NPP", Pos.ProperNoun));
+ //Cat V
+ FRENCH.addTag(new PosTag("V",Pos.IndicativeVerb));
+ FRENCH.addTag(new PosTag("VIMP",Pos.ImperativeVerb));
+ FRENCH.addTag(new PosTag("VINF",Pos.Infinitive));
+ FRENCH.addTag(new PosTag("VS",Pos.SubjunctiveVerb));
+ FRENCH.addTag(new PosTag("VPP",Pos.PastParticiple));
+ FRENCH.addTag(new PosTag("VPR", Pos.PresentParticiple)); //Verb
Present?
+ getInstance().add(FRENCH);
+ }
}