Author: rwesten
Date: Wed Sep 28 13:42:36 2011
New Revision: 1176881
URL: http://svn.apache.org/viewvc?rev=1176881&view=rev
Log:
deactivated the German Tokenizer because it sometimes creates wrong tokens, such
as including leading and trailing " in words (e.g. "Kung Fu Panda" -> ["Kung,
Fu, Panda"])
Modified:
incubator/stanbol/trunk/data/opennlp/lang/de/download_models.xml
Modified: incubator/stanbol/trunk/data/opennlp/lang/de/download_models.xml
URL:
http://svn.apache.org/viewvc/incubator/stanbol/trunk/data/opennlp/lang/de/download_models.xml?rev=1176881&r1=1176880&r2=1176881&view=diff
==============================================================================
--- incubator/stanbol/trunk/data/opennlp/lang/de/download_models.xml (original)
+++ incubator/stanbol/trunk/data/opennlp/lang/de/download_models.xml Wed Sep 28
13:42:36 2011
@@ -21,10 +21,17 @@
Plugin to download OpenNLP Models from the Web
</description>
+ <!-- Removed the German Tokenizer because it sometimes creates "wrong"
+ tokens. In the absence of this model, the SimpleTokenizer is used by
+ default. -->
<target name="download">
- <copy todir="${target.directory}" flatten="true">
+ <!-- Added this delete to ensure that already downloaded versions of the
+ German Tokenizer models are removed. -->
+ <delete dir="${target.directory}" includes="de-token.bin" />
+ <copy todir="${target.directory}" flatten="true">
<resources>
- <url url="${model.url}/de-token.bin"/>
+ <!-- url url="${model.url}/de-token.bin"/>
+ -->
<url url="${model.url}/de-sent.bin"/>
<url url="${model.url}/de-pos-perceptron.bin"/>
<!-- no Chunker for german