jav...

rwesten Thu, 13 Oct 2011 10:57:53 -0700

Modified: 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
--- 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml
 (original)
+++ 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml
 Thu Oct 13 17:57:25 2011
@@ -1,21 +1,21 @@
-<?xml version="1.0" encoding="UTF-8" ?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
 <!--  
  This is the Solr schema file. This file should be named "schema.xml" and
  should be in the conf directory under the solr home
@@ -32,25 +32,25 @@
  to specific requirements. See the comments within this schema for more
  details!
 
- For more information, on how to customize this file, please see
- http://wiki.apache.org/solr/SchemaXml
-
--->
-
-<schema name="Apache Stanbol SolrYard Schema" version="1.3">
+ For more information, on how to customize this file, please see
+ http://wiki.apache.org/solr/SchemaXml
+
+-->
+
+<schema name="Apache Stanbol SolrYard Schema" version="1.3">
   <!--
     The SolrYard supports a list of types that is reflected by
     "fieldType" specifications within this schema.
     See the specific fieldType definition for more information
   -->
-  <types>
+  <types>
     <!-- 
       This fieldType is used to store values with the dataType "xsd:string".
       It is NOT used for natural language texts. Assume that this data type is
       used for ISBN numbers, article numbers, string representations of
       unsupported data types ...
     -->
-    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
+    <!-- The StrField type is not analyzed, but indexed/stored verbatim. -->
     <fieldType name="string" class="solr.StrField" sortMissingLast="true" 
omitNorms="false"/>    
 
     <!-- 
@@ -66,14 +66,14 @@
         <filter class="solr.LowerCaseFilterFactory" />
       </analyzer>
     </fieldType>
-
-
+
+
     <!-- boolean type: "true" or "false" used to store values with the 
datatype "xsd:boolean" -->
-    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" 
omitNorms="true"/>
+    <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" 
omitNorms="true"/>
     <!--Binary data type. The data should be sent/retrieved as Base64 
encoded Strings.
         Currently not used by the SolrYard implementation, but reserved for 
future use. -->
-    <fieldtype name="binary" class="solr.BinaryField"/>
-
+    <fieldtype name="binary" class="solr.BinaryField"/>
+
     <!--
       Default numeric and date field types. By default used to index numeric 
values.
       Note that the "solr.TrieIntField" does support indexing values at various
@@ -83,32 +83,32 @@
       for all numeric fields of that types. See Solr documentation for
       suitable values and examples.
     -->
-    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="int" class="solr.TrieIntField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="long" class="solr.TrieLongField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" 
omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="date" class="solr.TrieDateField" omitNorms="true" 
precisionStep="0" positionIncrementGap="0"/>
-
-    <!--
-     Numeric field types that index each value at various levels of precision
-     to accelerate range queries when the number of values between the range
-     endpoints is large. See the javadoc for NumericRangeQuery for internal
-     implementation details.
-
-     Smaller precisionStep values (specified in bits) will lead to more tokens
-     indexed per value, slightly larger index size, and faster range queries.
-     A precisionStep of 0 disables indexing at different precision levels.
-    -->
-    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
-    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+
+    <!--
+     Numeric field types that index each value at various levels of precision
+     to accelerate range queries when the number of values between the range
+     endpoints is large. See the javadoc for NumericRangeQuery for internal
+     implementation details.
+
+     Smaller precisionStep values (specified in bits) will lead to more tokens
+     indexed per value, slightly larger index size, and faster range queries.
+     A precisionStep of 0 disables indexing at different precision levels.
+    -->
+    <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
+    <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" 
omitNorms="true" positionIncrementGap="0"/>
     <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" 
precisionStep="6" positionIncrementGap="0"/>
-
-    <!-- not used
+
+    <!-- not used
     <fieldType name="random" class="solr.RandomSortField" indexed="true" />
-     -->
-
+     -->
+
     <!-- 
       Natural Language Texts
       
@@ -129,158 +129,201 @@
       together with string values within a special field to support searches 
for
       texts without a specified language.
     -->
-
+
     <!-- A text field that only splits on whitespace for exact matching of 
words 
-        Currently not used. May be used as an alternative to the textgen 
fieldType.
-    <fieldType name="text_ws" class="solr.TextField" 
positionIncrementGap="100">
-      <analyzer>
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-      </analyzer>
-    </fieldType> -->
-
+        Currently not used. May be used as an alternative to the textgen 
fieldType.
+    <fieldType name="text_ws" class="solr.TextField" 
positionIncrementGap="100">
+      <analyzer>
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+      </analyzer>
+    </fieldType> -->
+
     <!--
         Default Text field configuration that comes with the Solr 
distribution. Currently
         Not used by the SolrYard
         
           
-        A text field that uses WordDelimiterFilter to enable splitting and 
matching of
-        words on case-change, alpha numeric boundaries, and non-alphanumeric 
chars,
-        so that a query of "wifi" or "wi fi" could match a document containing 
"Wi-Fi".
-        Synonyms and stopwords are customized by external files, and stemming 
is enabled.
-        The attribute autoGeneratePhraseQueries="true" (the default) causes 
words that get split to
-        form phrase queries. For example, WordDelimiterFilter splitting 
text:pdp-11 will cause the parser
-        to generate text:"pdp 11" rather than (text:PDP OR text:11).
-        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non 
whitespace delimited languages.
-    <fieldType name="text_en" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="true" omitNorms="false">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
-        <filter class="solr.PorterStemFilterFactory"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
-        <filter class="solr.PorterStemFilterFactory"/>
-      </analyzer>
+        A text field that uses WordDelimiterFilter to enable splitting and 
matching of
+        words on case-change, alpha numeric boundaries, and non-alphanumeric 
chars,
+        so that a query of "wifi" or "wi fi" could match a document containing 
"Wi-Fi".
+        Synonyms and stopwords are customized by external files, and stemming 
is enabled.
+        The attribute autoGeneratePhraseQueries="true" (the default) causes 
words that get split to
+        form phrase queries. For example, WordDelimiterFilter splitting 
text:pdp-11 will cause the parser
+        to generate text:"pdp 11" rather than (text:PDP OR text:11).
+        NOTE: autoGeneratePhraseQueries="true" tends to not work well for non 
whitespace delimited languages.
+    <fieldType name="text_en" class="solr.TextField" 
positionIncrementGap="100" autoGeneratePhraseQueries="true" omitNorms="false">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="1"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+        <filter class="solr.PorterStemFilterFactory"/>
+      </analyzer>
     </fieldType>  -->
-
-
-    <!-- 
-         This is the default fieldType used for english language texts.
-         
-         Less flexible matching, but less false matches.  Probably not ideal 
for product names,
-         but may be good for SKUs.  Can insert dashes in the wrong place and 
still match. -->
-    <fieldType name="text_en_Tight" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
-      <analyzer>
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="false"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" 
generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
-        <filter class="solr.EnglishMinimalStemFilterFactory"/>
-        <!-- this filter can remove any duplicate tokens that appear at the 
same position - sometimes
-             possible with WordDelimiterFilter in conjuncton with stemming. -->
-        <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
-      </analyzer>
-    </fieldType>
-
-
-    <!-- 
-         The default for any language without a special field definition.
 
-         A general unstemmed text field - good if one does not know the 
language of the field -->
-    <fieldType name="textgen" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-    </fieldType>
-
-
-    <!-- A general unstemmed text field that indexes tokens normally and also
-         reversed (via ReversedWildcardFilterFactory), to enable more 
efficient 
+
+      <!-- 
+       ENGLISH
+       
+       This is the default fieldType used for english language texts.
+       
+       Less flexible matching, but less false matches.  Probably not ideal for 
product names,
+       but may be good for SKUs.  Can insert dashes in the wrong place and 
still match. -->
+      <fieldType name="text_en_Tight" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
+          <analyzer>
+              <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+              <filter class="solr.SynonymFilterFactory" 
synonyms="synonyms.txt" ignoreCase="true" expand="false"/>
+              <filter class="solr.HyphenatedWordsFilterFactory"/>
+              <filter class="solr.LowerCaseFilterFactory"/>
+              <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
+              <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="0" generateNumberParts="0" catenateWords="1" 
catenateNumbers="1" catenateAll="0"/>
+              <filter class="solr.KeywordMarkerFilterFactory" 
protected="protwords.txt"/>
+              <filter class="solr.EnglishMinimalStemFilterFactory"/>
+              <!-- this filter can remove any duplicate tokens that appear at 
the same position - sometimes
+               possible with WordDelimiterFilter in conjunction with stemming. 
-->
+              <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+          </analyzer>
+      </fieldType>
+      
+      <!--
+       GERMAN
+       
+       This is the default fieldType used for German language texts.
+       
+       Less flexible matching, but less false matches.  Probably not ideal for 
product names,
+       but may be good for SKUs.  Can insert dashes in the wrong place and 
still match. -->
+      <fieldType name="text_de" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
+          <analyzer type="index">
+              <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+              <filter class="solr.SynonymFilterFactory" 
synonyms="synonyms_de.txt" ignoreCase="true" expand="false"/>
+              <filter class="solr.HyphenatedWordsFilterFactory"/>
+              <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="1" 
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
+              <filter class="solr.LowerCaseFilterFactory"/>
+              <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords_de.txt" enablePositionIncrements="true" />
+              <filter class="solr.GermanStemFilterFactory"/>
+              <!-- this filter can remove any duplicate tokens that appear at 
the same position - sometimes
+               possible with WordDelimiterFilter in conjunction with stemming. 
-->
+              <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+          </analyzer>
+          <analyzer type="query">
+              <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+              <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="0" 
catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/>
+              <filter class="solr.LowerCaseFilterFactory"/>
+              <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords_de.txt" enablePositionIncrements="true" />
+              <filter class="solr.GermanStemFilterFactory"/>   
+              <filter class="solr.RemoveDuplicatesTokenFilterFactory"/>
+          </analyzer>
+      </fieldType>
+      
+      
+      <!-- 
+       GENERIC (no specific language support)
+       
+       The default for any language without a special field definition.
+       
+       Uses the ICUTokenizer and tries to convert alphabetic, numeric, and 
symbolic Unicode characters which 
+       are not in the first 127 ASCII characters (the "Basic Latin" Unicode 
block) into their ASCII 
+       equivalents, if one exists. (STANBOL-
+       (see 
http://lucene.apache.org/java/2_9_1/api/all/org/apache/lucene/analysis/ASCIIFoldingFilter.html)
+       
+       -->
+      <fieldType name="textgen" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
+          <analyzer type="index">
+              <tokenizer class="solr.ICUTokenizerFactory"/>
+              <filter class="solr.ASCIIFoldingFilterFactory"/>
+              <filter class="solr.HyphenatedWordsFilterFactory"/>
+              <!--        <filter class="solr.StopFilterFactory" 
ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> -->
+              <!--        <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="1" 
catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> -->
+              <filter class="solr.LowerCaseFilterFactory"/>
+          </analyzer>
+          <analyzer type="query">
+              <tokenizer class="solr.ICUTokenizerFactory"/>
+              <filter class="solr.SynonymFilterFactory" 
synonyms="synonyms.txt" ignoreCase="true" expand="true"/>
+              <filter class="solr.ASCIIFoldingFilterFactory"/>
+              <!--        <filter class="solr.StopFilterFactory" 
ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> -->
+              <!--        <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="0" 
catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> -->
+              <filter class="solr.LowerCaseFilterFactory"/>
+          </analyzer>
+      </fieldType>
+
+
+    <!-- A general unstemmed text field that indexes tokens normally and also
+         reversed (via ReversedWildcardFilterFactory), to enable more 
efficient 
         leading wildcard queries.
      
-     Not used 
-    <fieldType name="text_rev" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
-      <analyzer type="index">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" 
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
-      </analyzer>
-      <analyzer type="query">
-        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
-        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
-        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
-        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="0"/>
-        <filter class="solr.LowerCaseFilterFactory"/>
-      </analyzer>
-    </fieldType> -->
-
+     Not used 
+    <fieldType name="text_rev" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
+      <analyzer type="index">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true" />
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" 
splitOnCaseChange="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+        <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" 
maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/>
+      </analyzer>
+      <analyzer type="query">
+        <tokenizer class="solr.WhitespaceTokenizerFactory"/>
+        <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" 
ignoreCase="true" expand="true"/>
+        <filter class="solr.StopFilterFactory" ignoreCase="true" 
words="stopwords.txt" enablePositionIncrements="true"/>
+        <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" 
generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" 
splitOnCaseChange="0"/>
+        <filter class="solr.LowerCaseFilterFactory"/>
+      </analyzer>
+    </fieldType> -->
+
     <!-- A KeywordTokenizer that does not include some properties of the 
source text.
          
          TODO:
           - This might be useful for searching labels
           - Rename to label if used for that
           - Add 0-9 to the regex pattern to preserve numbers
-         
-      -->
-    <fieldType name="alphaOnlySort" class="solr.TextField" 
sortMissingLast="true" omitNorms="false">
-      <analyzer>
-        <!-- KeywordTokenizer does not tokenize -->
-        <tokenizer class="solr.KeywordTokenizerFactory"/>
-        <filter class="solr.LowerCaseFilterFactory" />
-        <filter class="solr.TrimFilterFactory" />
-        <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" 
replacement="" replace="all" />
-      </analyzer>
-    </fieldType>
-    
+         
+      -->
+    <fieldType name="alphaOnlySort" class="solr.TextField" 
sortMissingLast="true" omitNorms="false">
+      <analyzer>
+        <!-- KeywordTokenizer does not tokenize -->
+        <tokenizer class="solr.KeywordTokenizerFactory"/>
+        <filter class="solr.LowerCaseFilterFactory" />
+        <filter class="solr.TrimFilterFactory" />
+        <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" 
replacement="" replace="all" />
+      </analyzer>
+    </fieldType>
+    
     <!-- not used
-    <fieldtype name="phonetic" stored="false" indexed="true" 
class="solr.TextField" >
-      <analyzer>
-        <tokenizer class="solr.StandardTokenizerFactory"/>
-        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
-      </analyzer>
-    </fieldtype> -->
-
-    <fieldType name="text_path" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
-      <analyzer>
-        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
-      </analyzer>
-    </fieldType>
-
-    <!-- since fields of this type are by default not stored or indexed,
-         any data added to them will be ignored outright.  --> 
-    <fieldtype name="ignored" stored="false" indexed="false" 
multiValued="true" class="solr.StrField" />
-
-    <!-- Spatial features are not yet supported by the Entityhub
-    <fieldType name="point" class="solr.PointType" dimension="2" 
subFieldSuffix="_d"/>
-    <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
-    <fieldtype name="geohash" class="solr.GeoHashField"/>
+    <fieldtype name="phonetic" stored="false" indexed="true" 
class="solr.TextField" >
+      <analyzer>
+        <tokenizer class="solr.StandardTokenizerFactory"/>
+        <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/>
+      </analyzer>
+    </fieldtype> -->
+
+    <fieldType name="text_path" class="solr.TextField" 
positionIncrementGap="100" omitNorms="false">
+      <analyzer>
+        <tokenizer class="solr.PathHierarchyTokenizerFactory"/>
+      </analyzer>
+    </fieldType>
+
+    <!-- since fields of this type are by default not stored or indexed,
+         any data added to them will be ignored outright.  --> 
+    <fieldtype name="ignored" stored="false" indexed="false" 
multiValued="true" class="solr.StrField" />
+
+    <!-- Spatial features are not yet supported by the Entityhub
+    <fieldType name="point" class="solr.PointType" dimension="2" 
subFieldSuffix="_d"/>
+    <fieldType name="location" class="solr.LatLonType" 
subFieldSuffix="_coordinate"/>
+    <fieldtype name="geohash" class="solr.GeoHashField"/>
      -->
- </types>
-
-
- <fields>
+ </types>
+
+
+ <fields>
    <!-- 
      For Information about the different attributes for fields
      see http://wiki.apache.org/solr/SchemaXml. 
@@ -309,7 +352,7 @@
      Do not change this definition!
    -->
    <field name="_domain" type="string" indexed="true" stored="false" 
multiValued="true"/>
-   
+   
    <!--
      DBPedia specific Field definitions 
    -->
@@ -326,10 +369,10 @@
 
    <!-- Do index, but not store abstracts -->
    <field name="@en/dbp-ont:abstract/"  type="text_en_Tight"  indexed="true" 
stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
-   <field name="@de/dbp-ont:abstract/"  type="textgen"  indexed="true" 
stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
+   <field name="@de/dbp-ont:abstract/"  type="text_de"  indexed="true" 
stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
    <field name="@it/dbp-ont:abstract/"  type="textgen"  indexed="true" 
stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
    <field name="@fr/dbp-ont:abstract/"  type="textgen"  indexed="true" 
stored="false" multiValued="true" omitNorms="false" termVectors="true"/>
-
+
 
    <!-- 
      Dynamic field definitions (used if a field name is not found)
@@ -398,11 +441,16 @@
        use @en-GB/* and @en* to have a special field type for 
          en-GB and one for other english text
    -->
-   <!-- 
-     Dynamic field for english languages.
-     Note that the prefix "@en*" matches also "@en-GB" and "@en-US"
-   -->
-   <dynamicField name="@en*"  type="text_en_Tight" indexed="true" 
stored="true" multiValued="true" omitNorms="false"/>
+     <!-- 
+      Dynamic field for english languages.
+      Note that the prefix "@en*" matches also "@en-GB" and "@en-US"
+      -->
+     <dynamicField name="@en*"  type="text_en_Tight" indexed="true" 
stored="true" multiValued="true" omitNorms="false"/>
+     <!-- 
+      Dynamic field for German language texts.
+      Note that the prefix "@de*" matches also "@de-DE" and "@de-AT"
+      -->
+     <dynamicField name="@de*"  type="text_de" indexed="true" stored="true" 
multiValued="true" omitNorms="false"/>
    <!-- 
      The "@*" catches all the other languages including "@/" 
      (default language) used for texts without a defined language
@@ -457,21 +505,21 @@
      Do not change this definition!
    -->
    <dynamicField name="_config/*" type="string" indexed="false" 
multiValued="true"/>
-   
- </fields>
-
+   
+ </fields>
+
  <!-- 
    Field to use to determine and enforce document uniqueness.
    -->
- <uniqueKey>uri</uniqueKey>
-
+ <uniqueKey>uri</uniqueKey>
+
  <!-- 
    field for the QueryParser to use when an explicit fieldname is absent.
    The SolrYard does currently not take advantage of this. However it can
    be used when directly accessing the SolrYard.
  -->
- <defaultSearchField>_text</defaultSearchField>
-
+ <defaultSearchField>_text</defaultSearchField>
+
  <!--
   The SolrYard explicitly adds AND and OR for all boolean terms in
    generated queries. So changing that should have no influence on
@@ -479,13 +527,13 @@
    
    SolrQueryParser configuration: defaultOperator="AND|OR" 
  -->
- <solrQueryParser defaultOperator="OR"/>
-
+ <solrQueryParser defaultOperator="OR"/>
+
   <!--
     The SolrYard Implementation assumes the following copyField commands.
     These commands MUST NOT be removed! 
    -->
-
+
    <!-- 
      Values of all fields that represent natural language texts
      or string values are copied to the default search field
@@ -504,7 +552,7 @@
      all references to it)
    -->
    <copyField source="ref/*" dest="_ref"/>
-       
-
-
-</schema>
+       
+
+
+</schema>


Added: 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt?rev=1183014&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt
 (added)
+++ 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt
 Thu Oct 13 17:57:25 2011
@@ -0,0 +1,237 @@
+# A German stop word list. Comments begin with a hash sign (#). Each stop
+# word is at the start of a line.
+
+# The number of forms in this list is reduced significantly by passing it
+# through the German stemmer.
+
+
+aber
+alle
+allem
+allen
+aller
+alles
+als
+also
+am
+an
+ander
+andere
+anderem
+anderen
+anderer
+anderes
+andern
+anders
+auch
+auf
+aus
+bei
+bin
+bis
+bist
+da
+damit
+dann
+der
+den
+des
+dem
+die
+das
+daß
+dass
+derselbe
+derselben
+denselben
+desselben
+demselben
+dieselbe
+dieselben
+dasselbe
+dazu
+dein
+deine
+deinem
+deinen
+deiner
+deines
+denn
+derer
+dessen
+dich
+dir
+du
+dies
+diese
+diesem
+diesen
+dieser
+dieses
+doch
+dort
+durch
+ein
+eine
+einem
+einen
+einer
+eines
+einig
+einige
+einigem
+einigen
+einiger
+einiges
+einmal
+er
+ihn
+ihm
+es
+etwas
+euer
+eure
+eurem
+euren
+eurer
+eures
+für
+gegen
+gewesen
+hab
+habe
+haben
+hat
+hatte
+hatten
+hier
+hin
+hinter
+ich
+mich
+mir
+ihr
+ihre
+ihrem
+ihren
+ihrer
+ihres
+euch
+im
+in
+indem
+ins
+ist
+jede
+jedem
+jeden
+jeder
+jedes
+jene
+jenem
+jenen
+jener
+jenes
+jetzt
+kann
+kein
+keine
+keinem
+keinen
+keiner
+keines
+können
+könnte
+machen
+man
+manche
+manchem
+manchen
+mancher
+manches
+mein
+meine
+meinem
+meinen
+meiner
+meines
+mit
+muss
+musste
+nach
+nicht
+nichts
+noch
+nun
+nur
+ob
+oder
+ohne
+sehr
+sein
+seine
+seinem
+seinen
+seiner
+seines
+selbst
+sich
+sie
+ihnen
+sind
+so
+solche
+solchem
+solchen
+solcher
+solches
+soll
+sollte
+sondern
+sonst
+über
+um
+und
+uns
+unse
+unserem
+unseren
+unser
+unseres
+unter
+viel
+vom
+von
+vor
+während
+war
+waren
+warst
+was
+#weg -> also a noun describing small streets
+weil
+weiter
+welche
+welchem
+welchen
+welcher
+welches
+wenn
+werde
+werden
+wie
+wieder
+will
+wir
+wird
+wirst
+wo
+wollen
+wollte
+würde
+würden
+zu
+zum
+zur
+zwar
+zwischen
\ No newline at end of file

Propchange: 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt?rev=1183014&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt
 (added)
+++ 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt
 Thu Oct 13 17:57:25 2011
@@ -0,0 +1,23 @@
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#-----------------------------------------------------------------------
+# Some synonym groups specific to this example
+GB,gib,gigabyte,gigabytes
+MB,mib,megabyte,megabytes
+Television, Televisions, TV, TVs
+#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming
+#after us won't split it into two words.
+
+# Synonym mappings can be used for spelling correction too
+# pixima => pixma
+

Propchange: 
incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Modified: 
incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml 
(original)
+++ incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml Thu 
Oct 13 17:57:25 2011
@@ -218,6 +218,21 @@
                        <artifactId>lucene-spatial</artifactId>
                        <scope>test</scope>
                </dependency>
+           <dependency>
+            <groupId>org.apache.lucene</groupId>
+                   <artifactId>lucene-icu</artifactId>
+                   <scope>test</scope>
+           </dependency>
+           <dependency>
+                   <groupId>org.apache.lucene</groupId>
+                   <artifactId>lucene-icu4j</artifactId>
+                   <scope>test</scope>
+           </dependency>
+           <dependency>
+                   <groupId>org.apache.solr</groupId>
+                   <artifactId>solr-analysis-extras</artifactId>
+                   <scope>test</scope>
+           </dependency>
                <dependency>
                        <groupId>jakarta-regexp</groupId>
                        <artifactId>jakarta-regexp</artifactId>

Modified: incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml (original)
+++ incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml Thu Oct 13 
17:57:25 2011
@@ -130,11 +130,12 @@
                        <artifactId>xercesImpl</artifactId>
                        <scope>runtime</scope>
                </dependency>
+<!-- we have lucene-icu4j anyway
                <dependency>
                        <groupId>com.ibm.icu</groupId>
                        <artifactId>icu4j</artifactId>
                        <scope>runtime</scope>
-               </dependency>
+               </dependency>  -->
 
                <!-- dependencies of the solr yard destination -->
                <dependency>
@@ -247,6 +248,21 @@
                        <artifactId>lucene-spatial</artifactId>
                        <scope>runtime</scope>
                </dependency>
+        <dependency>
+          <groupId>org.apache.lucene</groupId>
+          <artifactId>lucene-icu</artifactId>
+          <scope>runtime</scope>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.lucene</groupId>
+          <artifactId>lucene-icu4j</artifactId>
+          <scope>runtime</scope>
+        </dependency>
+           <dependency>
+             <groupId>org.apache.solr</groupId>
+             <artifactId>solr-analysis-extras</artifactId>
+             <scope>runtime</scope>
+           </dependency>
                <dependency>
                        <groupId>jakarta-regexp</groupId>
                        <artifactId>jakarta-regexp</artifactId>

Modified: incubator/stanbol/trunk/entityhub/yard/solr/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
--- incubator/stanbol/trunk/entityhub/yard/solr/pom.xml (original)
+++ incubator/stanbol/trunk/entityhub/yard/solr/pom.xml Thu Oct 13 17:57:25 2011
@@ -238,6 +238,22 @@
       <scope>test</scope>
     </dependency>
     <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-icu</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.lucene</groupId>
+      <artifactId>lucene-icu4j</artifactId>
+      <scope>test</scope>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.solr</groupId>
+      <artifactId>solr-analysis-extras</artifactId>
+      <scope>test</scope>
+    </dependency>
+    
+    <dependency>
       <groupId>jakarta-regexp</groupId>
       <artifactId>jakarta-regexp</artifactId>
       <scope>test</scope>

Propchange: 
incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Thu Oct 13 17:57:25 2011
@@ -0,0 +1 @@
+default

Modified: 
incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
Binary files - no diff available.

Modified: 
incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
Binary files - no diff available.

Added: 
incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config?rev=1183014&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config
 (added)
+++ 
incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config
 Thu Oct 13 17:57:25 2011
@@ -0,0 +1,4 @@
+org.apache.sling.commons.log.pattern="{0,date,dd.MM.yyyy\ HH:mm:ss.SSS}\ 
*{4}*\ [{2}]\ {3}\ {5}"
+org.apache.sling.commons.log.names=["org.apache.solr"]
+org.apache.sling.commons.log.level="warn"
+org.apache.sling.commons.log.file="logs/error.log"
\ No newline at end of file

Added: 
incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config?rev=1183014&view=auto
==============================================================================
--- 
incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config
 (added)
+++ 
incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config
 Thu Oct 13 17:57:25 2011
@@ -0,0 +1,4 @@
+org.apache.sling.commons.log.pattern="{0,date,dd.MM.yyyy\ HH:mm:ss.SSS}\ 
*{4}*\ [{2}]\ {3}\ {5}"
+org.apache.sling.commons.log.names=["org.apache.solr"]
+org.apache.sling.commons.log.level="warn"
+org.apache.sling.commons.log.file="logs/error.log"
\ No newline at end of file

Modified: incubator/stanbol/trunk/parent/pom.xml
URL: 
http://svn.apache.org/viewvc/incubator/stanbol/trunk/parent/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff
==============================================================================
--- incubator/stanbol/trunk/parent/pom.xml (original)
+++ incubator/stanbol/trunk/parent/pom.xml Thu Oct 13 17:57:25 2011
@@ -921,28 +921,28 @@
         <version>3.2.0</version>
         <scope>provided</scope>
     </dependency>
-    <!-- other Solr bundles currently not used
     <dependency>
         <groupId>org.apache.solr</groupId>
-        <artifactId>solr-clustering</artifactId>
+        <artifactId>solr-analysis-extras</artifactId>
         <version>3.2.0</version>
         <scope>provided</scope>
     </dependency>
+    <!-- other Solr bundles currently not used
     <dependency>
         <groupId>org.apache.solr</groupId>
-        <artifactId>solr-cell</artifactId>
+        <artifactId>solr-clustering</artifactId>
         <version>3.2.0</version>
         <scope>provided</scope>
     </dependency>
     <dependency>
         <groupId>org.apache.solr</groupId>
-        <artifactId>solr-carrot2-core</artifactId>
+        <artifactId>solr-cell</artifactId>
         <version>3.2.0</version>
         <scope>provided</scope>
     </dependency>
     <dependency>
         <groupId>org.apache.solr</groupId>
-        <artifactId>solr-analysis-extras</artifactId>
+        <artifactId>solr-carrot2-core</artifactId>
         <version>3.2.0</version>
         <scope>provided</scope>
     </dependency>   -->

svn commit: r1183014 [2/2] - in /incubator/stanbol/trunk: commons/opennlp/src/main/java/org/apache/stanbol/commons/opennlp/ commons/opennlp/src/test/java/org/apache/commons/opennlp/ commons/solr/ commons/stanboltools/bundledatafileprovider/src/main/jav...

Reply via email to