Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/schema.xml URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/schema.xml?rev=1071231&r1=1071230&r2=1071231&view=diff ============================================================================== --- incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/schema.xml (original) +++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/schema.xml Wed Feb 16 12:45:36 2011 @@ -22,204 +22,142 @@ (i.e. ./solr/conf/schema.xml by default) or located where the classloader for the Solr webapp can find it. - This example schema is the recommended starting point for users. - It should be kept correct and concise, usable out-of-the-box. + This schema will be used by the Apache Stanbol SolrYard implementation to + index entities. + The implementation of the SolrYard makes several assumptions about + configurations defined in this Schema. So changes to this schema that do + affect such assumptions will most likely cause unpredictable errors! + + However there are also a lot of places where users can optimize this schema + to specific requirements. See the comments within this schema for more + details! - For more information, on how to customize this file, please see - http://wiki.apache.org/solr/SchemaXml + For more information, on how to customize the Solr schema.xml in general, + please see http://wiki.apache.org/solr/SchemaXml. - PERFORMANCE NOTE: this schema includes many optional features and should not - be used for benchmarking. To improve performance one could - - set stored="false" for all fields possible (esp large fields) when you - only need to search on the field but don't need to return the original - value. - - set indexed="false" if you don't need to search on the field, but only - return the field as a result of searching on other indexed fields. 
- - remove all unneeded copyField statements - - for best index size and searching performance, set "index" to false - for all general text fields, use copyField to copy them to the - catchall "text" field, and use that for searching. - - For maximum indexing performance, use the StreamingUpdateSolrServer - java client. - - Remember to run the JVM in server mode, and use a higher logging level - that avoids logging every request --> -<schema name="IKS RICK Solr Yard Schema" version="1.2"> - <!-- attribute "name" is the name of this schema and is only used for display purposes. - Applications should change this to reflect the nature of the search collection. - version="1.2" is Solr's version number for the schema syntax and semantics. It should - not normally be changed by applications. - 1.0: multiValued attribute did not exist, all fields are multiValued by nature - 1.1: multiValued attribute introduced, false by default - 1.2: omitTermFreqAndPositions attribute introduced, true by default except for text fields. - --> - +<schema name="Apache Stanbol SolrYard Schema" version="1.2"> + <!-- + The SolrYard supports a list of types that is reflected by + "fieldType" specifications within this schema. + See the specific fieldType definition for more information + --> <types> - <!-- field type definitions. The "name" attribute is - just a label to be used by field definitions. The "class" - attribute and any other attributes determine the real - behavior of the fieldType. - Class names starting with "solr" refer to java classes in the - org.apache.solr.analysis package. - --> - - <!-- The StrField type is not analyzed, but indexed/stored verbatim. - - StrField and TextField support an optional compressThreshold which - limits compression (if enabled in the derived fields) to values which - exceed a certain size (in characters). + <!-- + This fieldType is used to store values with the dataType "xsd:string". + It is NOT used for natural language texts. 
Assume that this data type is + used for ISBN numbers, article numbers, string representations of + unsupported data types ... --> <fieldType name="string" class="solr.StrField" sortMissingLast="true" omitNorms="true"/> + <!-- + This can be used as alternative to "string" to enable case insensitive + searches on string values. + The KeywordTokenizerFactory ensures that the whole string is preserved as + a single token. + --> + <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.KeywordTokenizerFactory"/> + <filter class="solr.LowerCaseFilterFactory" /> + </analyzer> + </fieldType> + - <!-- boolean type: "true" or "false" --> + <!-- boolean type: "true" or "false" used to store values with the datatype "xsd:boolean" --> <fieldType name="boolean" class="solr.BoolField" sortMissingLast="true" omitNorms="true"/> - <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings --> + <!--Binary data type. The data should be sent/retrieved in as Base64 encoded Strings. + Currently not used by the SolrYard implementation, but reserved for future use. --> <fieldtype name="binary" class="solr.BinaryField"/> - - <!-- The optional sortMissingLast and sortMissingFirst attributes are - currently supported on types that are sorted internally as strings. - This includes "string","boolean","sint","slong","sfloat","sdouble","pdate" - - If sortMissingLast="true", then a sort on this field will cause documents - without the field to come after documents with the field, - regardless of the requested sort order (asc or desc). - - If sortMissingFirst="true", then a sort on this field will cause documents - without the field to come before documents with the field, - regardless of the requested sort order. 
- - If sortMissingLast="false" and sortMissingFirst="false" (the default), - then default lucene sorting will be used which places docs without the - field first in an ascending sort and last in a descending sort. - --> - <!-- - Default numeric field types. For faster range queries, consider the tint/tfloat/tlong/tdouble types. + Default numeric and date field types. By default used to index numeric values. + Note that the "solr.TrieIntField" does support indexing values at various + levels of precision to accelerate range queries. However the + precisionStep of 0 used by these fieldTypes disables this feature. + Change precisionStep to values > 0 to activate hierarchical indexing + for all numeric fields of these types. See Solr documentation for + suitable values and examples. --> <fieldType name="int" class="solr.TrieIntField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="float" class="solr.TrieFloatField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="long" class="solr.TrieLongField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> <fieldType name="double" class="solr.TrieDoubleField" precisionStep="0" omitNorms="true" positionIncrementGap="0"/> + <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/> <!-- - Numeric field types that index each value at various levels of precision - to accelerate range queries when the number of values between the range - endpoints is large. See the javadoc for NumericRangeQuery for internal - implementation details. - - Smaller precisionStep values (specified in bits) will lead to more tokens - indexed per value, slightly larger index size, and faster range queries. - A precisionStep of 0 disables indexing at different precision levels. + Numeric and date field types that do activate indexing values at various + levels of precision to accelerate range queries. 
+ This can be used to activate hierarchical indexing for specific + fields. See Notes within the field section. --> <fieldType name="tint" class="solr.TrieIntField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> <fieldType name="tfloat" class="solr.TrieFloatField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> <fieldType name="tlong" class="solr.TrieLongField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> <fieldType name="tdouble" class="solr.TrieDoubleField" precisionStep="8" omitNorms="true" positionIncrementGap="0"/> - - <!-- The format for this date field is of the form 1995-12-31T23:59:59Z, and - is a more restricted form of the canonical representation of dateTime - http://www.w3.org/TR/xmlschema-2/#dateTime - The trailing "Z" designates UTC time and is mandatory. - Optional fractional seconds are allowed: 1995-12-31T23:59:59.999Z - All other components are mandatory. - - Expressions can also be used to denote calculations that should be - performed relative to "NOW" to determine the value, ie... - - NOW/HOUR - ... Round to the start of the current hour - NOW-1DAY - ... Exactly 1 day prior to now - NOW/DAY+6MONTHS+3DAYS - ... 6 months and 3 days in the future from the start of - the current day - - Consult the DateField javadocs for more information. - - Note: For faster range queries, consider the tdate type - --> - <fieldType name="date" class="solr.TrieDateField" omitNorms="true" precisionStep="0" positionIncrementGap="0"/> - - <!-- A Trie based date field for faster date range queries and date faceting. --> <fieldType name="tdate" class="solr.TrieDateField" omitNorms="true" precisionStep="6" positionIncrementGap="0"/> - <!-- The "RandomSortField" is not used to store or search any - data. You can declare fields of this type it in your schema - to generate pseudo-random orderings of your docs for sorting - purposes. 
The ordering is generated based on the field name - and the version of the index, As long as the index version - remains unchanged, and the same field name is reused, - the ordering of the docs will be consistent. - If you want different psuedo-random orderings of documents, - for the same version of the index, use a dynamicField and - change the name - --> - <fieldType name="random" class="solr.RandomSortField" indexed="true" /> - - <!-- solr.TextField allows the specification of custom text analyzers - specified as a tokenizer and a list of token filters. Different - analyzers may be specified for indexing and querying. - - The optional positionIncrementGap puts space between multiple fields of - this type on the same document, with the purpose of preventing false phrase - matching across fields. - - For more info on customizing your analyzer chain, please see - http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters - --> - - <!-- One can also specify an existing Analyzer class that has a - default constructor via the class attribute on the analyzer element - <fieldType name="text_greek" class="solr.TextField"> - <analyzer class="org.apache.lucene.analysis.el.GreekAnalyzer"/> - </fieldType> + <!-- + Natural Language Texts + + Indexing of natural language texts are supported by the solr.TextField class that + allows the specification of custom text analyzers specified as a tokenizer and a + list of token filters. + + For more info on customizing your analyzer chain, please see + http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters + + The SolrYard has support for different languages. Within the schema.xml one need + to define first a "fieldType" for language and second a dynamicField for the + prefix used by the SolrYard for this language. + For more information about the prefixes used by the SolrYard see the notes in the + field section of this configuration. 
+ + In addition the SolrYard also indexes natural language values (of any language) + together with string values within a special field to support searches for + texts without an specified language. --> - - <!-- A text field that only splits on whitespace for exact matching of words --> - <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - </analyzer> - </fieldType> - - <!-- A text field that uses WordDelimiterFilter to enable splitting and matching of - words on case-change, alpha numeric boundaries, and non-alphanumeric chars, - so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi". - Synonyms and stopwords are customized by external files, and stemming is enabled. - --> - <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> + <!-- + A general unstemmed text field - good if one does not know the language of the field. + This is used as the default fieldType for fields that store values of different + languages. + It is also the default fieldType for languages that do not define special fieldTypes. + --> + <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <!-- in this example, we will only use synonyms at query time - <filter class="solr.SynonymFilterFactory" synonyms="index_synonyms.txt" ignoreCase="true" expand="false"/> - --> - <!-- Case insensitive stop word removal. - add enablePositionIncrements=true in both the index and query - analyzers to leave a 'gap' for more accurate phrase queries. 
- --> - <filter class="solr.StopFilterFactory" - ignoreCase="true" - words="stopwords.txt" - enablePositionIncrements="true" - /> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> - <filter class="solr.StopFilterFactory" - ignoreCase="true" - words="stopwords.txt" - enablePositionIncrements="true" - /> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> </analyzer> </fieldType> - - - <!-- Less flexible matching, but less false matches. Probably not ideal for product names, - but may be good for SKUs. Can insert dashes in the wrong place and still match. --> + + <!-- + A text field that only splits on whitespace for exact matching of words. + Currently not used. 
May be used as an alternative to the textgen fieldType. + --> + <!-- + <fieldType name="text_ws" class="solr.TextField" positionIncrementGap="100"> + <analyzer> + <tokenizer class="solr.WhitespaceTokenizerFactory"/> + </analyzer> + </fieldType> + --> + + <!-- + This is the default fieldType used for english language texts. + + Less flexible matching than the text_en field type, but less false matches. + Probably not ideal for product names, but may be good for SKUs. + Can insert dashes in the wrong place and still match. + --> <fieldType name="text_en_Tight" class="solr.TextField" positionIncrementGap="100" > <analyzer> <tokenizer class="solr.WhitespaceTokenizerFactory"/> @@ -235,49 +173,64 @@ </fieldType> - <!-- A general unstemmed text field - good if one does not know the language of the field --> - <fieldType name="textgen" class="solr.TextField" positionIncrementGap="100"> + <!-- + This can be used as an alternative to the "text_en_Tight" fieldType for + English language texts. + + A text field that uses WordDelimiterFilter to enable splitting and matching of + words on case-change, alpha numeric boundaries, and non-alphanumeric chars, + so that a query of "wifi" or "wi fi" could match a document containing "Wi-Fi". + Synonyms and stopwords are customized by external files, and stemming is enabled. 
+ --> + <!-- + <fieldType name="text_en" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> - <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> + <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="1"/> <filter class="solr.LowerCaseFilterFactory"/> + <filter class="solr.SnowballPorterFilterFactory" language="English" protected="protwords.txt"/> </analyzer> </fieldType> - - - <!-- A general unstemmed text field that indexes tokens normally and also - reversed (via ReversedWildcardFilterFactory), to enable more efficient - leading wildcard queries. --> + --> + + <!-- + The SolrYard allows leading Wildcards (e.g. "*aris"). 
To provide + good query performance for such queries one need to configure + fieldTypes that use the ReversedWildcardFilterFactory as shown by + this example. + See Solr documentation for details + + A general unstemmed text field that indexes tokens normally and also + reversed (via ReversedWildcardFilterFactory), to enable more efficient + leading wildcard queries. + --> + <!-- <fieldType name="text_rev" class="solr.TextField" positionIncrementGap="100"> <analyzer type="index"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true" /> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="1" catenateNumbers="1" catenateAll="0" splitOnCaseChange="0"/> <filter class="solr.LowerCaseFilterFactory"/> - <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" - maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> + <filter class="solr.ReversedWildcardFilterFactory" withOriginal="true" maxPosAsterisk="3" maxPosQuestion="2" maxFractionAsterisk="0.33"/> </analyzer> <analyzer type="query"> <tokenizer class="solr.WhitespaceTokenizerFactory"/> <filter class="solr.SynonymFilterFactory" synonyms="synonyms.txt" ignoreCase="true" expand="true"/> - <filter class="solr.StopFilterFactory" - ignoreCase="true" - words="stopwords.txt" - enablePositionIncrements="true" - /> + <filter class="solr.StopFilterFactory" ignoreCase="true" words="stopwords.txt" enablePositionIncrements="true"/> <filter class="solr.WordDelimiterFilterFactory" generateWordParts="1" generateNumberParts="1" catenateWords="0" catenateNumbers="0" catenateAll="0" splitOnCaseChange="0"/> <filter class="solr.LowerCaseFilterFactory"/> </analyzer> </fieldType> - + --> <!-- charFilter + WhitespaceTokenizer --> <!-- <fieldType name="textCharNorm" class="solr.TextField" positionIncrementGap="100" > @@ -288,273 +241,218 @@ </fieldType> 
--> - <!-- This is an example of using the KeywordTokenizer along - With various TokenFilterFactories to produce a sortable field - that does not include some properties of the source text - --> - <fieldType name="alphaOnlySort" class="solr.TextField" sortMissingLast="true" omitNorms="true"> - <analyzer> - <!-- KeywordTokenizer does no actual tokenizing, so the entire - input string is preserved as a single token - --> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <!-- The LowerCase TokenFilter does what you expect, which can be - when you want your sorting to be case insensitive - --> - <filter class="solr.LowerCaseFilterFactory" /> - <!-- The TrimFilter removes any leading or trailing whitespace --> - <filter class="solr.TrimFilterFactory" /> - <!-- The PatternReplaceFilter gives you the flexibility to use - Java Regular expression to replace any sequence of characters - matching a pattern with an arbitrary replacement string, - which may include back references to portions of the original - string matched by the pattern. - - See the Java Regular Expression documentation for more - information on pattern and replacement string syntax. - - http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/package-summary.html - --> - <filter class="solr.PatternReplaceFilterFactory" - pattern="([^a-z])" replacement="" replace="all" - /> - </analyzer> - </fieldType> - - <fieldtype name="phonetic" stored="false" indexed="true" class="solr.TextField" > - <analyzer> - <tokenizer class="solr.StandardTokenizerFactory"/> - <filter class="solr.DoubleMetaphoneFilterFactory" inject="false"/> - </analyzer> - </fieldtype> - - <fieldtype name="payloads" stored="false" indexed="true" class="solr.TextField" > - <analyzer> - <tokenizer class="solr.WhitespaceTokenizerFactory"/> - <!-- - The DelimitedPayloadTokenFilter can put payloads on tokens... 
for example, - a token of "foo|1.4" would be indexed as "foo" with a payload of 1.4f - Attributes of the DelimitedPayloadTokenFilterFactory : - "delimiter" - a one character delimiter. Default is | (pipe) - "encoder" - how to encode the following value into a playload - float -> org.apache.lucene.analysis.payloads.FloatEncoder, - integer -> o.a.l.a.p.IntegerEncoder - identity -> o.a.l.a.p.IdentityEncoder - Fully Qualified class name implementing PayloadEncoder, Encoder must have a no arg constructor. - --> - <filter class="solr.DelimitedPayloadTokenFilterFactory" encoder="float"/> - </analyzer> - </fieldtype> - - <!-- lowercases the entire field value, keeping it as a single token. --> - <fieldType name="lowercase" class="solr.TextField" positionIncrementGap="100"> - <analyzer> - <tokenizer class="solr.KeywordTokenizerFactory"/> - <filter class="solr.LowerCaseFilterFactory" /> - </analyzer> - </fieldType> - - - <!-- since fields of this type are by default not stored or indexed, - any data added to them will be ignored outright. --> + <!-- + This can be used to deactivate some functionality of the SolrYard or + to configure that some fields of a data set are not stored nor indexed + regardless of the Apache Stanbol Entityhub configuration! 
+ --> <fieldtype name="ignored" stored="false" indexed="false" multiValued="true" class="solr.StrField" /> </types> <fields> - <!-- Valid attributes for fields: - name: mandatory - the name for the field - type: mandatory - the name of a previously defined type from the - <types> section - indexed: true if this field should be indexed (searchable or sortable) - stored: true if this field should be retrievable - compressed: [false] if this field should be stored using gzip compression - (this will only apply if the field type is compressable; among - the standard field types, only TextField and StrField are) - multiValued: true if this field may contain multiple values per document - omitNorms: (expert) set to true to omit the norms associated with - this field (this disables length normalization and index-time - boosting for the field, and saves some memory). Only full-text - fields or fields that need an index-time boost need norms. - termVectors: [false] set to true to store the term vector for a - given field. - When using MoreLikeThis, fields used for similarity should be - stored for best performance. - termPositions: Store position information with the term vector. - This will increase storage costs. - termOffsets: Store offset information with the term vector. This - will increase storage costs. - default: a value that should be used if no value is specified - when adding a document. + <!-- + For Information about the different attributes for fields + see http://wiki.apache.org/solr/SchemaXml. --> <!-- - The _uri field is used as ID for documents! + The uri field is used as ID for documents indexed by the SolrYard. + Do not change this definition! 
--> <field name="uri" type="string" indexed="true" stored="true" required="true" /> -<!-- - <field name="sku" type="textTight" indexed="true" stored="true" omitNorms="true"/> - <field name="name" type="textgen" indexed="true" stored="true"/> - <field name="alphaNameSort" type="alphaOnlySort" indexed="true" stored="false"/> - <field name="manu" type="textgen" indexed="true" stored="true" omitNorms="true"/> - <field name="cat" type="text_ws" indexed="true" stored="true" multiValued="true" omitNorms="true" /> - <field name="features" type="text" indexed="true" stored="true" multiValued="true"/> - <field name="includes" type="text" indexed="true" stored="true" termVectors="true" termPositions="true" termOffsets="true" /> - - <field name="weight" type="float" indexed="true" stored="true"/> - <field name="price" type="float" indexed="true" stored="true"/> - <field name="popularity" type="int" indexed="true" stored="true" /> - <field name="inStock" type="boolean" indexed="true" stored="true" /> ---> - <!-- Common metadata fields, named specifically to match up with - SolrCell metadata when parsing rich documents such as Word, PDF. - Some fields are multiValued only because Tika currently may return - multiple values for them. 
--> -<!-- - <field name="title" type="text" indexed="true" stored="true" multiValued="true"/> - <field name="subject" type="text" indexed="true" stored="true"/> - <field name="description" type="text" indexed="true" stored="true"/> - <field name="comments" type="text" indexed="true" stored="true"/> - <field name="author" type="textgen" indexed="true" stored="true"/> - <field name="keywords" type="textgen" indexed="true" stored="true"/> - <field name="category" type="textgen" indexed="true" stored="true"/> - <field name="content_type" type="string" indexed="true" stored="true" multiValued="true"/> - <field name="last_modified" type="date" indexed="true" stored="true"/> - <field name="links" type="string" indexed="true" stored="true" multiValued="true"/> - --> - - <!-- ************************************************ - SPECIAL FIELDS - ************************************************ --> - <!-- used to index all natural language text of the document (via copyField) --> + <!-- + used to index all natural language texts of all fields of a document + (via copyField). This is used as default search field. + The type may be changed. + --> <field name="_text" type="textgen" indexed="true" stored="false" multiValued="true"/> - <!-- used to store all references of the document (via copyField) - This field is also important if to update dependend documents if a document - is deleted within the index --> + <!-- + used to store all references of the document (via copyField). + This field may be used to search for related entities. + Do not change this definition! + --> <field name="_ref" type="string" indexed="true" stored="false" multiValued="true"/> - <!-- This field is used to store the a key used to seperate documents stored by - different yards into a single Index. In the SolrYard configuration this can be - activated by setting "Multi Yard Layout" to true. 
- Yards using this setting will store there yardId in this field when storing documents - All queries will use this as constraint in the FilterQuery. + <!-- + Field used to store the domain in case multiple datasets are stored within the same + Solr index (search for "Multi Yard Layout" for details). + Do not change this definition! --> <field name="_domain" type="string" indexed="true" stored="false" multiValued="true"/> - - - <!-- Uncommenting the following will create a "timestamp" field using - a default value of "NOW" to indicate when each document was indexed. - --> + <!-- + Dynamic field definitions (used if a field name is not found) + see http://wiki.apache.org/solr/SchemaXml for details. + + The SolrYard heavily uses dynamic fields to index fields based on their + data type. + Notes: + - By default all dynamicField specifications use multiValued="true" + because this schema.xml does not make any assumptions on the data stored. + Even a boolean field may define both "true" and "false" as values. + - The SolrYard currently uses only prefixes to represent data types + and languages. However postfixes are reserved for future extensions. + That means that postfixes MUST NOT be used for dynamicField definitions + --> <!-- - <field name="timestamp" type="date" indexed="true" stored="true" default="NOW" multiValued="false"/> - --> - - - <!-- Dynamic field definitions. If a field name is not found, dynamicFields - will be used if the name matches any of the patterns. - RESTRICTION: the glob-like pattern in the name attribute must have - a "*" only at the start or the end. - EXAMPLE: name="*_i" will match any field ending in _i (like myid_i, z_i) - Longer patterns will be matched first. if equal size patterns - both match, the first appearing in the schema will be used. 
--> - <dynamicField name="bool/*" type="boolean" indexed="true" stored="true"/> - <dynamicField name="int/*" type="int" indexed="true" stored="true" multiValued="true"/><!-- TODO: check if we need multivalued for numbers --> + Used for all fields with boolean data type. + One can change multivalued to "false" in case the indexed data conform to + this restriction + --> + <dynamicField name="bool/*" type="boolean" indexed="true" stored="true" multiValued="true"/> + <!-- + Numeric dataTypes. + Note that the type can be changed to fieldTypes supporting hierarchical + indexing of values to increase performance of ranged queries + --> + <dynamicField name="int/*" type="int" indexed="true" stored="true" multiValued="true"/> <dynamicField name="lon/*" type="long" indexed="true" stored="true" multiValued="true"/> <dynamicField name="flo/*" type="float" indexed="true" stored="true" multiValued="true"/> <dynamicField name="dou/*" type="double" indexed="true" stored="true" multiValued="true"/> <dynamicField name="cal/*" type="date" indexed="true" stored="true" multiValued="true"/> - <dynamicField name="dur/*" type="string" indexed="true" stored="true" multiValued="true"/> <!-- TODO: Add support for duration! --> - <!-- String fields that are not natural language --> + <dynamicField name="dur/*" type="string" indexed="true" stored="true" multiValued="true"/> + <!-- + String fields that are not natural language + To support case insensitive searches in such fields change + the type to "lowercase" + --> <dynamicField name="str/*" type="string" indexed="true" stored="true" multiValued="true"/> - <!-- references are URIs to other resources --> + <!-- + references are values that represent IDs of other resources. + Typically this will store URIs but in principle also other IDs + could be used. 
+ --> <dynamicField name="ref/*" type="string" indexed="true" stored="true" multiValued="true"/> - <!-- add here special field types for known languages - use @en* to match en-GB and en-US - use @en.* to match only en but not en-GB or en-US - use @en-GB.* and @en* to have a special field type for en-GB and - one for other english text --> - <dynamicField name="@en*" type="text_en_Tight" indexed="true" stored="true" multiValued="true"/> - <!-- the "@*" catches all the other languages including "@." (default - language) --> + <!-- + DynamicFields representing natural language texts. + + The type of such fields may be changed to one of the alternatives + described in the types section of this configuration. + + The SolrYard prefixes natural language texts with "@" + followed by the defined language. + Currently no processing of the language is done. So + defining the language "1xx5zr7" for Text will create + a field with the prefix "@1xx5zr7". + + To define a special field type for a language one has + to use "@" plus the key for the language as prefix. + See the specification for English language texts as example. + + Also prefixes for country specific languages can be + realized by defining dynamic fields as follows: + use @en* to match en-GB and en-US + use @en/* to match only en but not en-GB or en-US + use @en-GB/* and @en* to have a special field type for + en-GB and one for other english text + --> + <!-- + Dynamic field for english languages. 
+ Note that the prefix "@en*" matches also "@en-GB" and "@en-US" + --> + <dynamicField name="@en*" type="text_en_Tight" indexed="true" stored="true" multiValued="true"/> + <!-- + The "@*" catches all the other languages including "@/" + (default language) used for texts without a defined language + --> <dynamicField name="@*" type="textgen" indexed="true" stored="true" multiValued="true"/> + <!-- + To add special configurations for specific fields one + has to include the fieldName within the prefix of the + dynamicField specification. + The SolrYard uses namespace prefixes to generate + field names. When defined, the prefixes defined in the + NamespaceEnum of the Entityhub are used. + Currently there is no way to define used prefixes for + other namespaces. + + This example shows how to activate lower case search + for the dcmi-terms format property + <dynamicField name="str/dc:format*" type="lowercase" indexed="true" stored="true" multiValued="true"/> + + This example shows how to activate fast ranged queries + for spatial searches + <dynamicField name="dou/geo:lat*" type="tdouble" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="dou/geo:lon*" type="tdouble" indexed="true" stored="true" multiValued="true"/> + <dynamicField name="int/geo:alt*" type="tint" indexed="true" stored="true" multiValued="true"/> + + This example defines to use the type "text_en" for + english language rdfs:comment values. + + <dynamicField name="@en/rdfs:comment*" type="text_en" indexed="true" stored="true" multiValued="true"/> + + Note that this would not match "en-GB" nor "en-US". + --> + - <!-- ************************************************ - SPECIAL DYNAMIC FIELDS - ************************************************ --> + <!-- + The following dynamic field specifications define fields used + by the SolrYard implementation for special purposes. + --> - <!-- the "_!@*" fields contain a copy of all languages for that field. 
- we need not to store such values, because they are only needed in the - inverted index --> + <!-- + the "_!@*" fields contain a copy of all languages AND string values + for that field. This field is used for text queries with no specified + language. + This field need not to be stored. The type can be changed to alternatives + as described in the types section of this configuration. + --> <dynamicField name="_!@*" type="textgen" indexed="true" stored="false" multiValued="true"/> - <!-- fields starting with _config. are used to store configurations about how the - index was created within the index (e.g. used namespace prefixes) --> + <!-- + fields starting with "_config/" are used to store configurations about how the + index was created within the index (e.g. used namespace prefixes). + Do not change this definition! + --> <dynamicField name="_config/*" type="string" indexed="false" multiValued="true"/> - - - <!-- uncomment the following to ignore any fields that don't already match an existing - field name or dynamic field, rather than reporting them as an error. - alternately, change the type="ignored" to some other type e.g. "text" if you want - unknown fields indexed and/or stored by default --> - <!--dynamicField name="*" type="ignored" multiValued="true" /--> - + </fields> - <!-- Field to use to determine and enforce document uniqueness. - Unless this field is marked with required="false", it will be a required field + <!-- + Field to use to determine and enforce document uniqueness. --> <uniqueKey>uri</uniqueKey> - <!-- field for the QueryParser to use when an explicit fieldname is absent --> + <!-- + field for the QueryParser to use when an explicit fieldname is absent. + The SolrYard does currently not take advantage of this. However it can + be used when directly accessing the SolrYard. 
+ --> <defaultSearchField>_text</defaultSearchField> - <!-- SolrQueryParser configuration: defaultOperator="AND|OR" --> + <!-- + The SolrYard explicitly adds AND and OR for all boolean terms in + generated queries. So changing that should have no influence on + the SolrYard (not tested) + + SolrQueryParser configuration: defaultOperator="AND|OR" + --> <solrQueryParser defaultOperator="OR"/> - <!-- copyField commands copy one field to another at the time a document - is added to the index. It's used either to index the same field differently, - or to add multiple fields to the same field for easier/faster searching. --> + <!-- + The SolrYard Implementation assumes the following copyField commands. + These commands MUST NOT be removed! + --> - <!-- this defines what fields are copied to the default search field --> + <!-- + Values of all fields that represent natural language texts + or string values are copied to the default search field + "_text". + Currently the SolrYard does not use this field, but it is + reserved for future usage and MUST therefore already be + included when indexing documents + --> <copyField source="@*" dest="_text"/> <copyField source="str/*" dest="_text"/> + <!-- + All references to other entities (documents) need to be + copied to the "_ref" field. + This field is required to query for dependencies of other + documents (e.g. when one needs to remove a document and + all references to it) + --> <copyField source="ref/*" dest="_ref"/> - <!-- NOTE [UNIMPLEMENTED] - The current design reserves suffixes for semantic tagging. That means - that the suffix can be used to "tag" an field to play an specific - role (e.g. titel, description, categorization ... - The Intension is to use this feature for special copyField patterns - to index all titels, descriptions, categories ... in special fields - (Rupert Westenthaler - 2010-11-03) - --> - - <!-- Above, multiple source fields are copied to the [text] field. 
- Another way to map multiple source fields to the same - destination field is to use the dynamic field syntax. - copyField also supports a maxChars to copy setting. --> - - <!-- <copyField source="*_t" dest="text" maxChars="3000"/> --> - - <!-- copy name to alphaNameSort, a field designed for sorting by name --> - <!-- <copyField source="name" dest="alphaNameSort"/> --> - - - <!-- Similarity is the scoring routine for each document vs. a query. - A custom similarity may be specified here, but the default is fine - for most applications. --> - <!-- <similarity class="org.apache.lucene.search.DefaultSimilarity"/> --> - <!-- ... OR ... - Specify a SimilarityFactory class name implementation - allowing parameters to be used. - --> - <!-- - <similarity class="com.example.solr.CustomSimilarityFactory"> - <str name="paramkey">param value</str> - </similarity> - --> - - </schema>
Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/stopwords.txt URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/stopwords.txt?rev=1071231&r1=1071230&r2=1071231&view=diff ============================================================================== --- incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/stopwords.txt (original) +++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/stopwords.txt Wed Feb 16 12:45:36 2011 @@ -13,11 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -#----------------------------------------------------------------------- -# a couple of test stopwords to test that the words are really being -# configured from this file: -stopworda -stopwordb #Standard english stop words taken from Lucene's StopAnalyzer a Modified: incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/synonyms.txt URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/synonyms.txt?rev=1071231&r1=1071230&r2=1071231&view=diff ============================================================================== --- incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/synonyms.txt (original) +++ incubator/stanbol/trunk/entityhub/yard/solr/src/main/resources/solr/conf/test/conf/synonyms.txt Wed Feb 16 12:45:36 2011 @@ -10,15 +10,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-#----------------------------------------------------------------------- -#some test synonym mappings unlikely to appear in real input text -aaa => aaaa -bbb => bbbb1 bbbb2 -ccc => cccc1,cccc2 -a\=>a => b\=>b -a\,a => b\,b -fooaaa,baraaa,bazaaa - # Some synonym groups specific to this example GB,gib,gigabyte,gigabytes MB,mib,megabyte,megabytes
