wrong results with wdf & ngtf

aowen Thu, 20 Mar 2014 06:50:38 -0700

Is there a way to tell NGramFilterFactory at index time that numbers should 
never be split into n-grams? Then the query should be able to find numbers.


Or do I have to change the minimum n-gram size for numbers (not alpha) to 1, if 
that is possible? In other words, index the whole number as a single token 
rather than as all possible n-grams.

Solr analysis shows that only WDF produces tokens without the underscore; the 
rest keep it. Can I tell the query to search numbers differently with NGTF, WT, 
LCF or whatever?

I also tried <filter class="solr.WordDelimiterFilterFactory" 
types="at-under-alpha.txt"/>
                @ => ALPHA
                _ => ALPHA

I have gotten nearly everything to work. There are two queries where I don't 
get back what I want.

                "avaloq frage 1"               -> only returns if i set 
minGramSize=1 while indexing
                "yh_cug"                            -> query parser doesn't 
remove "_" but the indexer does (WDF) so there is no match

Is there a way to also query the whole term "avaloq frage 1" without tokenizing 
it?

Fieldtype:

<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">
      <analyzer type="index"> 
                               <tokenizer 
class="solr.StandardTokenizerFactory"/>
                                <filter class="solr.LowerCaseFilterFactory"/>
                               <filter class="solr.WordDelimiterFilterFactory" 
types="at-under-alpha.txt"/> 
                               <filter class="solr.StopFilterFactory" 
ignoreCase="true" words="lang/stopwords_de.txt" format="snowball" 
enablePositionIncrements="true"/> <!-- remove common words -->
                                <filter 
class="solr.GermanNormalizationFilterFactory"/>
                               <filter class="solr.SnowballPorterFilterFactory" 
language="German"/> <!-- remove noun/adjective inflections like plural endings 
-->                             
                               <filter class="solr.NGramFilterFactory" 
minGramSize="3" maxGramSize="15"/>
                               <filter class="solr.WordDelimiterFilterFactory" 
generateWordParts="1" generateNumberParts="1" catenateWords="1" 
catenateNumbers="1" catenateAll="0" splitOnCaseChange="1"/>
                   </analyzer>
                   <analyzer type="query">
                                               <tokenizer 
class="solr.WhiteSpaceTokenizerFactory"/>
                                               <filter 
class="solr.LowerCaseFilterFactory"/>
                                               <filter 
class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/> 
                                               <filter 
class="solr.StopFilterFactory" ignoreCase="true" words="lang/stopwords_de.txt" 
format="snowball" enablePositionIncrements="true"/> <!-- remove common words -->
                                               <filter 
class="solr.GermanNormalizationFilterFactory"/>
                                               <filter 
class="solr.SnowballPorterFilterFactory" language="German"/>
      </analyzer>
</fieldType>


Solrconfig:

> <queryParser name="synonym_edismax"
> class="solr.SynonymExpandingExtendedDismaxQParserPlugin">
>   <lst name="synonymAnalyzers">
> <lst name="myCoolAnalyzer">
>   <lst name="tokenizer">
> <str name="class">standard</str>
>   </lst>
>   <lst name="filter">
> <str name="class">shingle</str>
> <str name="outputUnigramsIfNoShingles">true</str>
> <str name="outputUnigrams">true</str>
> <str name="minShingleSize">2</str>
> <str name="maxShingleSize">4</str>
>   </lst>
>   <lst name="filter">
> <str name="class">synonym</str>
> <str name="tokenizerFactory">solr.KeywordTokenizerFactory</str>
> <str name="synonyms">synonyms.txt</str>
> <str name="expand">true</str>
> <str name="ignoreCase">true</str>
>   </lst>
> </lst>
>   </lst>
> </queryParser>
> 
> <requestHandler name="/select2" class="solr.SearchHandler">
>      <lst name="defaults">
>        <str name="echoParams">explicit</str>
>        <int name="rows">10</int>
>        <str name="defType">synonym_edismax</str>
>    <str name="synonyms">true</str>
>    <str name="qf">plain_text^10 editorschoice^200
> title^20 h_*^14
> tags^10 thema^15 inhaltstyp^6 breadcrumb^6 doctype^10
> contentmanager^5 links^5
> last_modified^5 url^5
>    </str>
>    <str name="bq">(expiration:[NOW TO *] OR (*:* 
> -expiration:*))^6</str>
>    <str name="bf">div(clicks,max(displays,1))^8</str> <!-- tested -->
> 
>        <str name="df">text</str>
>    <str name="fl">*,path,score</str>
>    <str name="wt">json</str>
>    <str name="q.op">AND</str>
> 
>    <!-- Highlighting defaults -->
>        <str name="hl">on</str>
>        <str name="hl.fl">plain_text,title</str>
>    <str name="hl.fragSize">200</str>
>    <str name="hl.simple.pre">&lt;b&gt;</str>
>        <str name="hl.simple.post">&lt;/b&gt;</str>
> 
> <!-- <lst name="invariants"> -->
>     <str name="facet">on</str>
> <str name="facet.mincount">1</str>
>         <str name="facet.field">{!ex=inhaltstyp_s}inhaltstyp_s</str>
> <str name="f.inhaltstyp_s.facet.sort">index</str>
> <str name="facet.field">{!ex=doctype}doctype</str>
> <str name="f.doctype.facet.sort">index</str>
> <str name="facet.field">{!ex=thema_f}thema_f</str>
> <str name="f.thema_f.facet.sort">index</str>
> <str name="facet.field">{!ex=author_s}author_s</str>
> <str name="f.author_s.facet.sort">index</str>
> <str
> name="facet.field">{!ex=sachverstaendiger_s}sachverstaendiger_s</str>
> <str name="f.sachverstaendiger_s.facet.sort">index</str>
> <str name="facet.field">{!ex=veranstaltung_s}veranstaltung_s</str>
> <str name="f.veranstaltung_s.facet.sort">index</str>
> <str name="facet.date">{!ex=last_modified}last_modified</str>
> <str name="facet.date.gap">+1MONTH</str>
> <str name="facet.date.end">NOW/MONTH+1MONTH</str>
> <str name="facet.date.start">NOW/MONTH-36MONTHS</str>
> <str name="facet.date.other">after</str>
> 
>        </lst>
> </requestHandler>

wrong results with wdf & ngtf

Reply via email to