Modified: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml?rev=1183014&r1=1183013&r2=1183014&view=diff ============================================================================== --- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml (original) +++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/schema.xml Thu Oct 13 17:57:25 2011 @@ -1,21 +1,21 @@ -<?xml version="1.0" encoding="UTF-8" ?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - +<?xml version="1.0" encoding="UTF-8" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + <!-- This is the Solr schema file. This file should be named "schema.xml" and should be in the conf directory under the solr home @@ -32,25 +32,25 @@ to specific requirements. See the comments within this schema for more details! - For more information, on how to customize this file, please see - http://wiki.apache.org/solr/SchemaXml - ---> - -<schema name="Apache Stanbol SolrYard Schema" version="1.3"> + For more information, on how to customize this file, please see + http://wiki.apache.org/solr/SchemaXml + +--> + +<schema name="Apache Stanbol SolrYard Schema" version="1.3"> <!-- The SolrYard supports a list of types that is reflected by "fieldType" specifications within this schema. See the specific fieldType definition for more information --> - <types> + <types> <!-- This fieldType is used to store values with the dataType "xsd:string". It is NOT used for natural language texts. Assume that this data type is used for ISBN numbers, article numbers, string representations of unsupported data types ... --> - <!-- The StrField type is not analyzed, but indexed/stored verbatim. --> + <!-- The StrField type is not analyzed, but indexed/stored verbatim. 
--> <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="false"/> <!-- @@ -66,14 +66,14 @@ <filter class="solr.LowerCaseFilterFactory" /> </analyzer> </fieldType> - - + + <!-- boolean type: "true" or "false" used to store values with the datatype "xsd:boolean" --> - <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/> + <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/> <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings. Currently not used by the SolrYard implementation, but reserved for future use. --> - <fieldtype name="binary" class="solr.BinaryField"/> - + <fieldtype name="binary" class="solr.BinaryField"/> + <!-- Default numeric and date field types. By default used to index numeric values. Note that the "solr.TrieIntField" does support indexing values at various @@ -83,32 +83,32 @@ for all numeric fields of that types. See Solr documentation for suitable values and examples. 
--> - <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> - <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> - <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> - <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/> - - <!-- - Numeric field types that index each value at various levels of precision - to accelerate range queries when the number of values between the range - endpoints is large. See the javadoc for NumericRangeQuery for internal - implementation details. - - Smaller precisionStep values (specified in bits) will lead to more tokens - indexed per value, slightly larger index size, and faster range queries. - A precisionStep of 0 disables indexing at different precision levels. 
- --> - <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> - <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> - <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> - <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + + <!-- + Numeric field types that index each value at various levels of precision + to accelerate range queries when the number of values between the range + endpoints is large. See the javadoc for NumericRangeQuery for internal + implementation details. + + Smaller precisionStep values (specified in bits) will lead to more tokens + indexed per value, slightly larger index size, and faster range queries. + A precisionStep of 0 disables indexing at different precision levels. + --> + <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/> - - <!-- not used + + <!-- not used <fieldType name="random" class="solr.RandomSortField" indexed="true" /> - --> - + --> + <!-- Natural Language Texts @@ -129,158 +129,201 @@ together with string values within a special field to support searches for texts without an specified language. --> - + <!-- A text field that only splits on whitespace for exact matching of words - Currently not used. May be used as an alternative to the textgen fieldType. 
- <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - </analyzer> - </fieldType> --> - + Currently not used. May be used as an alternative to the textgen fieldType. + <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + </analyzer> + </fieldType> --> + <!-- Default Text field configuration that comes with the Solr distribution. Currently Not used by the SolrYard - A text field that uses WordDelimiterFilter to enable splitting and matching of - words on case-change, alpha numeric boundaries, and non-alphanumeric chars, - so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi". - Synonyms and stopwords are customized by external files, and stemming is enabled. - The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to - form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser - to generate text:"pdp 11" rather than (text:PDP OR text:11). - NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages. 
- <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" omitNorms="false"> - <analyzer type="index"> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> - <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> - <filter class="solr.PorterStemFilterFactory"/> - </analyzer> - <analyzer type="query"> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> - <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> - <filter class="solr.PorterStemFilterFactory"/> - </analyzer> + A text field that uses WordDelimiterFilter to enable splitting and matching of + words on case-change, alpha numeric boundaries, and non-alphanumeric chars, + so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi". + Synonyms and stopwords are customized by external files, and stemming is enabled. + The attribute autoGeneratePhraseQueries="true" (the default) causes words that get split to + form phrase queries. For example, WordDelimiterFilter splitting text:pdp-11 will cause the parser + to generate text:"pdp 11" rather than (text:PDP OR text:11). 
+ NOTE: autoGeneratePhraseQueries="true" tends to not work well for non whitespace delimited languages. + <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100" autoGeneratePhraseQueries="true" omitNorms="false"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.PorterStemFilterFactory"/> + </analyzer> </fieldType> --> - - - <!-- - This is the default fieldType used for english language texts. - - Less flexible matching, but less false matches. Probably not ideal for product names, - but may be good for SKUs. Can insert dashes in the wrong place and still match. 
--> - <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> - <analyzer> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt"/> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> - <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> - <filter class="solr.EnglishMinimalStemFilterFactory"/> - <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes - possible with WordDelimiterFilter in conjuncton with stemming. --> - <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> - </analyzer> - </fieldType> - - - <!-- - The default for any language without a special field definition. 
- A general unstemmed text field - good if one does not know the language of the field --> - <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> - <analyzer type="index"> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - <analyzer type="query"> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - </fieldType> - - - <!-- A general unstemmed text field that indexes tokens normally and also - reversed (via ReversedWildcardFilterFactory), to enable more efficient + + <!-- + ENGLISH + + This is the default fieldType used for english language texts. + + Less flexible matching, but less false matches. Probably not ideal for product names, + but may be good for SKUs. Can insert dashes in the wrong place and still match. 
--> + <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="false"/> + <filter class="solr.HyphenatedWordsFilterFactory"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="0" generateNumberParts="0" catenateWords="1" catenateNumbers="1" catenateAll="0"/> + <filter class="solr.KeywordMarkerFilterFactory" protected="protwords.txt"/> + <filter class="solr.EnglishMinimalStemFilterFactory"/> + <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes + possible with WordDelimiterFilter in conjunction with stemming. --> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + + <!-- + GERMAN + + This is the default fieldType used for German language texts. + + Less flexible matching, but less false matches. Probably not ideal for product names, + but may be good for SKUs. Can insert dashes in the wrong place and still match. 
--> + <fieldType name="text_de" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms_de.txt" ignoreCase="true" expand="false"/> + <filter class="solr.HyphenatedWordsFilterFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_de.txt" enablePositionIncrements="true" /> + <filter class="solr.GermanStemFilterFactory"/> + <!-- this filter can remove any duplicate tokens that appear at the same position - sometimes + possible with WordDelimiterFilter in conjunction with stemming. --> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords_de.txt" enablePositionIncrements="true" /> + <filter class="solr.GermanStemFilterFactory"/> + <filter class="solr.RemoveDuplicatesTokenFilterFactory"/> + </analyzer> + </fieldType> + + + <!-- + GENERIC (no specific language support) + + The default for any language without a special field definition. + + Uses the ICUTokenizer and tries to convert alphabetic, numeric, and symbolic Unicode characters which + are not in the first 127 ASCII characters (the "Basic Latin" Unicode block) into their ASCII + equivalents, if one exists. 
(STANBOL- + (see http://lucene.apache.org/java/2_9_1/api/all/org/apache/lucene/analysis/ASCIIFoldingFilter.html) + + --> + <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> + <analyzer type="index"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.ASCIIFoldingFilterFactory"/> + <filter class="solr.HyphenatedWordsFilterFactory"/> + <!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> --> + <!-- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> --> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.ICUTokenizerFactory"/> + <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.ASCIIFoldingFilterFactory"/> + <!-- <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> --> + <!-- <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> --> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> + + + <!-- A general unstemmed text field that indexes tokens normally and also + reversed (via ReversedWildcardFilterFactory), to enable more efficient leading wildcard queries. 
- Not used - <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> - <analyzer type="index"> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> - <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> - </analyzer> - <analyzer type="query"> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> - <filter class="solr.LowerCaseFilterFactory"/> - </analyzer> - </fieldType> --> - + Not used + <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> + <analyzer type="index"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> + <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> + </analyzer> + <analyzer type="query"> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> 
+ <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> + <filter class="solr.LowerCaseFilterFactory"/> + </analyzer> + </fieldType> --> + <!-- A KeywordTokenizer that does not include some properties of the source text. TODO: - This might be usefull for searching labels - Rename to label if used for that - Add 0-9 to the regex patter to preserve numbers - - --> - <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="false"> - <analyzer> - <!-- KeywordTokenizer does not tokenize --> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <filter class="solr.LowerCaseFilterFactory" /> - <filter class="solr.TrimFilterFactory" /> - <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement="" replace="all" /> - </analyzer> - </fieldType> - + + --> + <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="false"> + <analyzer> + <!-- KeywordTokenizer does not tokenize --> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory" /> + <filter class="solr.TrimFilterFactory" /> + <filter class="solr.PatternReplaceFilterFactory" pattern="([^a-z])" replacement="" replace="all" /> + </analyzer> + </fieldType> + <!-- not used - <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > - <analyzer> - <tokenizer class="solr.StandardTokenizerFactory"/> - <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> - </analyzer> - </fieldtype> --> - - <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> - <analyzer> - <tokenizer 
class="solr.PathHierarchyTokenizerFactory"/> - </analyzer> - </fieldType> - - <!-- since fields of this type are by default not stored or indexed, - any data added to them will be ignored outright. --> - <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> - - <!-- Spatial features are not yet supported by the Entityhub - <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> - <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> - <fieldtype name="geohash" class="solr.GeoHashField"/> + <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > + <analyzer> + <tokenizer class="solr.StandardTokenizerFactory"/> + <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> + </analyzer> + </fieldtype> --> + + <fieldType name="text_path" class="solr.TextField" positionIncrementGap="100" omitNorms="false"> + <analyzer> + <tokenizer class="solr.PathHierarchyTokenizerFactory"/> + </analyzer> + </fieldType> + + <!-- since fields of this type are by default not stored or indexed, + any data added to them will be ignored outright. --> + <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> + + <!-- Spatial features are not yet supported by the Entityhub + <fieldType name="point" class="solr.PointType" dimension="2" subFieldSuffix="_d"/> + <fieldType name="location" class="solr.LatLonType" subFieldSuffix="_coordinate"/> + <fieldtype name="geohash" class="solr.GeoHashField"/> --> - </types> - - - <fields> + </types> + + + <fields> <!-- For Information about the different attributes for fields see http://wiki.apache.org/solr/SchemaXml. @@ -309,7 +352,7 @@ Do not change this definition! 
--> <field name="_domain" type="string" indexed="true" stored="false" multiValued="true"/> - + <!-- DBPedia specific Field definitions --> @@ -326,10 +369,10 @@ <!-- Do index, but not store abstracts --> <field name="@en/dbp-ont:abstract/" type="text_en_Tight" indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/> - <field name="@de/dbp-ont:abstract/" type="textgen" indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/> + <field name="@de/dbp-ont:abstract/" type="text_de" indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/> <field name="@it/dbp-ont:abstract/" type="textgen" indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/> <field name="@fr/dbp-ont:abstract/" type="textgen" indexed="true" stored="false" multiValued="true" omitNorms="false" termVectors="true"/> - + <!-- Dynamic field definitions (used if a field name is not found) @@ -398,11 +441,16 @@ use @en-GB/* and @en* to have a special field type for en-GB and one for other english text --> - <!-- - Dynamic field for english languages. - Note that the prefix "@en*" matches also "@en-GB" and "@en-US" - --> - <dynamicField name="@en*" type="text_en_Tight" indexed="true" stored="true" multiValued="true" omitNorms="false"/> + <!-- + Dynamic field for english languages. + Note that the prefix "@en*" matches also "@en-GB" and "@en-US" + --> + <dynamicField name="@en*" type="text_en_Tight" indexed="true" stored="true" multiValued="true" omitNorms="false"/> + <!-- + Dynamic field for the German language. + Note that the prefix "@de*" also matches "@de-DE" and "@de-AT" + --> + <dynamicField name="@de*" type="text_de" indexed="true" stored="true" multiValued="true" omitNorms="false"/> <!-- The "@*" catches all the other languages including "@/" (default language) used for texts without a defined language @@ -457,21 +505,21 @@ Do not change this definition! 
--> <dynamicField name="_config/*" type="string" indexed="false" multiValued="true"/> - - </fields> - + + </fields> + <!-- Field to use to determine and enforce document uniqueness. --> - <uniqueKey>uri</uniqueKey> - + <uniqueKey>uri</uniqueKey> + <!-- field for the QueryParser to use when an explicit fieldname is absent. The SolrYard does currently not take advantage of this. However it can be used when directly accessing the SolrYard. --> - <defaultSearchField>_text</defaultSearchField> - + <defaultSearchField>_text</defaultSearchField> + <!-- The SolrYard explicitly adds AND and OR for all boolean terms in generated queries. So changing that should have no influence on @@ -479,13 +527,13 @@ SolrQueryParser configuration: defaultOperator="AND|OR" --> - <solrQueryParser defaultOperator="OR"/> - + <solrQueryParser defaultOperator="OR"/> + <!-- The SolrYard Implementation assumes the following copyField commands. These commands MUST NOT be removed! --> - + <!-- Values of all fields that represent natural language texts or string values are copied to the default search field @@ -504,7 +552,7 @@ all references to it) --> <copyField source="ref/*" dest="_ref"/> - - - -</schema> + + + +</schema>
Added: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt?rev=1183014&view=auto ============================================================================== --- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt (added) +++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt Thu Oct 13 17:57:25 2011 @@ -0,0 +1,237 @@ +# A German stop word list. Comments begin with a hash sign (#). Each stop +# word is at the start of a line. + +# The number of forms in this list is reduced significantly by passing it +# through the German stemmer. + + +aber +alle +allem +allen +aller +alles +als +also +am +an +ander +andere +anderem +anderen +anderer +anderes +andern +anders +auch +auf +aus +bei +bin +bis +bist +da +damit +dann +der +den +des +dem +die +das +daß +dass +derselbe +derselben +denselben +desselben +demselben +dieselbe +dieselben +dasselbe +dazu +dein +deine +deinem +deinen +deiner +deines +denn +derer +dessen +dich +dir +du +dies +diese +diesem +diesen +dieser +dieses +doch +dort +durch +ein +eine +einem +einen +einer +eines +einig +einige +einigem +einigen +einiger +einiges +einmal +er +ihn +ihm +es +etwas +euer +eure +eurem +euren +eurer +eures +für +gegen +gewesen +hab +habe +haben +hat +hatte +hatten +hier +hin +hinter +ich +mich +mir +ihr +ihre +ihrem +ihren +ihrer +ihres +euch +im +in +indem +ins +ist +jede +jedem +jeden +jeder +jedes +jene +jenem +jenen +jener +jenes +jetzt +kann +kein +keine +keinem +keinen +keiner +keines +können +könnte +machen +man +manche +manchem +manchen +mancher +manches +mein +meine +meinem +meinen +meiner +meines +mit +muss +musste +nach +nicht +nichts +noch +nun +nur +ob +oder +ohne +sehr +sein 
+seine +seinem +seinen +seiner +seines +selbst +sich +sie +ihnen +sind +so +solche +solchem +solchen +solcher +solches +soll +sollte +sondern +sonst +über +um +und +uns +unse +unserem +unseren +unser +unseres +unter +viel +vom +von +vor +während +war +waren +warst +was +#weg -> also a noun describing small streets +weil +weiter +welche +welchem +welchen +welcher +welches +wenn +werde +werden +wie +wieder +will +wir +wird +wirst +wo +wollen +wollte +würde +würden +zu +zum +zur +zwar +zwischen \ No newline at end of file Propchange: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/stopwords_de.txt ------------------------------------------------------------------------------ svn:mime-type = text/plain Added: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt?rev=1183014&view=auto ============================================================================== --- incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt (added) +++ incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt Thu Oct 13 17:57:25 2011 @@ -0,0 +1,23 @@ +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#----------------------------------------------------------------------- +# Some synonym groups specific to this example +GB,gib,gigabyte,gigabytes +MB,mib,megabyte,megabytes +Television, Televisions, TV, TVs +#notice we use "gib" instead of "GiB" so any WordDelimiterFilter coming +#after us won't split it into two words. + +# Synonym mappings can be used for spelling correction too +# pixima => pixma + Propchange: incubator/stanbol/trunk/entityhub/indexing/dbpedia/src/main/resources/indexing/config/dbpedia/conf/synonyms_de.txt ------------------------------------------------------------------------------ svn:mime-type = text/plain Modified: incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff ============================================================================== --- incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml (original) +++ incubator/stanbol/trunk/entityhub/indexing/destination/solryard/pom.xml Thu Oct 13 17:57:25 2011 @@ -218,6 +218,21 @@ <artifactId>lucene-spatial</artifactId> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-icu</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-icu4j</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-analysis-extras</artifactId> + <scope>test</scope> + </dependency> <dependency> <groupId>jakarta-regexp</groupId> <artifactId>jakarta-regexp</artifactId> Modified: incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff 
============================================================================== --- incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml (original) +++ incubator/stanbol/trunk/entityhub/indexing/genericrdf/pom.xml Thu Oct 13 17:57:25 2011 @@ -130,11 +130,12 @@ <artifactId>xercesImpl</artifactId> <scope>runtime</scope> </dependency> +<!-- we have lucene-icu4j anyway <dependency> <groupId>com.ibm.icu</groupId> <artifactId>icu4j</artifactId> <scope>runtime</scope> - </dependency> + </dependency> --> <!-- dependencies of the solr yard destination --> <dependency> @@ -247,6 +248,21 @@ <artifactId>lucene-spatial</artifactId> <scope>runtime</scope> </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-icu</artifactId> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-icu4j</artifactId> + <scope>runtime</scope> + </dependency> + <dependency> + <groupId>org.apache.solr</groupId> + <artifactId>solr-analysis-extras</artifactId> + <scope>runtime</scope> + </dependency> <dependency> <groupId>jakarta-regexp</groupId> <artifactId>jakarta-regexp</artifactId> Modified: incubator/stanbol/trunk/entityhub/yard/solr/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff ============================================================================== --- incubator/stanbol/trunk/entityhub/yard/solr/pom.xml (original) +++ incubator/stanbol/trunk/entityhub/yard/solr/pom.xml Thu Oct 13 17:57:25 2011 @@ -238,6 +238,22 @@ <scope>test</scope> </dependency> <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-icu</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.lucene</groupId> + <artifactId>lucene-icu4j</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.solr</groupId> + 
<artifactId>solr-analysis-extras</artifactId> + <scope>test</scope> + </dependency> + + <dependency> <groupId>jakarta-regexp</groupId> <artifactId>jakarta-regexp</artifactId> <scope>test</scope> Propchange: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/ ------------------------------------------------------------------------------ --- svn:ignore (added) +++ svn:ignore Thu Oct 13 17:57:25 2011 @@ -0,0 +1 @@ +default Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/default.solrindex.zip?rev=1183014&r1=1183013&r2=1183014&view=diff ============================================================================== Binary files - no diff available. Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/core/entityhub.solrindex.zip?rev=1183014&r1=1183013&r2=1183014&view=diff ============================================================================== Binary files - no diff available. 
Added: incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config?rev=1183014&view=auto ============================================================================== --- incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config (added) +++ incubator/stanbol/trunk/launchers/full/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config Thu Oct 13 17:57:25 2011 @@ -0,0 +1,4 @@ +org.apache.sling.commons.log.pattern="{0,date,dd.MM.yyyy\ HH:mm:ss.SSS}\ *{4}*\ [{2}]\ {3}\ {5}" +org.apache.sling.commons.log.names=["org.apache.solr"] +org.apache.sling.commons.log.level="warn" +org.apache.sling.commons.log.file="logs/error.log" \ No newline at end of file Added: incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config?rev=1183014&view=auto ============================================================================== --- incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config (added) +++ incubator/stanbol/trunk/launchers/stable/src/main/resources/resources/config/org.apache.sling.commons.log.LogManager.factory.config-solrlogconfig.config Thu Oct 13 17:57:25 2011 @@ -0,0 +1,4 @@ +org.apache.sling.commons.log.pattern="{0,date,dd.MM.yyyy\ HH:mm:ss.SSS}\ *{4}*\ [{2}]\ {3}\ {5}" 
+org.apache.sling.commons.log.names=["org.apache.solr"] +org.apache.sling.commons.log.level="warn" +org.apache.sling.commons.log.file="logs/error.log" \ No newline at end of file Modified: incubator/stanbol/trunk/parent/pom.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/parent/pom.xml?rev=1183014&r1=1183013&r2=1183014&view=diff ============================================================================== --- incubator/stanbol/trunk/parent/pom.xml (original) +++ incubator/stanbol/trunk/parent/pom.xml Thu Oct 13 17:57:25 2011 @@ -921,28 +921,28 @@ <version>3.2.0</version> <scope>provided</scope> </dependency> - <!-- other Solr bundles currently not used <dependency> <groupId>org.apache.solr</groupId> - <artifactId>solr-clustering</artifactId> + <artifactId>solr-analysis-extras</artifactId> <version>3.2.0</version> <scope>provided</scope> </dependency> + <!-- other Solr bundles currently not used <dependency> <groupId>org.apache.solr</groupId> - <artifactId>solr-cell</artifactId> + <artifactId>solr-clustering</artifactId> <version>3.2.0</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.solr</groupId> - <artifactId>solr-carrot2-core</artifactId> + <artifactId>solr-cell</artifactId> <version>3.2.0</version> <scope>provided</scope> </dependency> <dependency> <groupId>org.apache.solr</groupId> - <artifactId>solr-analysis-extras</artifactId> + <artifactId>solr-carrot2-core</artifactId> <version>3.2.0</version> <scope>provided</scope> </dependency> -->
