wrong results with wdf & ngtf

Andreas Owen Thu, 20 Mar 2014 02:50:07 -0700

Is there a way to tell NGramFilterFactory, while indexing, that numbers shall
never be tokenized? Then the query should be able to find numbers.


 

Or do I have to change the ngram-min for numbers (not alpha) to 1, if that
is possible? So to speak, put the whole number in as one token and not all
possible tokens.

 

Solr analysis shows that only WDF has no underscore in its tokens; the rest
have it. Can I tell the query to search numbers differently with NGTF, WT, LCF
or whatever?

 

I also tried <filter class="solr.WordDelimiterFilterFactory"
types="at-under-alpha.txt"/>

                @ => ALPHA

                _ => ALPHA

 

I have gotten nearly everything to work. There are two queries where I don't
get back what I want.

 

                "avaloq frage 1"               -> only returns if i set
minGramSize=1 while indexing

                "yh_cug"                            -> query parser doesn't
remove "_" but the indexer does (WDF) so there is no match

 

Is there a way to also query the whole term "avaloq frage 1" without
tokenizing it?

 

Fieldtype:

 

<!-- Field type "text_de": a solr.TextField with separate index-time and
     query-time analysis chains. Filters run in the order they are listed. -->
<fieldType name="text_de" class="solr.TextField" positionIncrementGap="100">

  <!-- Index-time chain. -->
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <!-- Word-delimiter pass whose character types are read from at-under-alpha.txt. -->
    <filter class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/>
    <!-- Drops common words listed in the German snowball-format stopword file. -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_de.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.GermanNormalizationFilterFactory"/>
    <!-- German stemming: strips noun/adjective inflections such as plural endings. -->
    <filter class="solr.SnowballPorterFilterFactory" language="German"/>
    <!-- Emits n-grams of 3 to 15 characters.
         NOTE(review): presumably tokens shorter than minGramSize yield no gram,
         which would affect one-character terms such as "1" - confirm in the
         analysis screen. -->
    <filter class="solr.NGramFilterFactory" minGramSize="3" maxGramSize="15"/>
    <!-- Second word-delimiter pass, applied to the n-grams.
         NOTE(review): unlike the first pass this one has no types attribute,
         so at-under-alpha.txt is not applied here - confirm this is intended. -->
    <filter class="solr.WordDelimiterFilterFactory"
            generateWordParts="1"
            generateNumberParts="1"
            catenateWords="1"
            catenateNumbers="1"
            catenateAll="0"
            splitOnCaseChange="1"/>
  </analyzer>

  <!-- Query-time chain.
       NOTE(review): it uses a different tokenizer than the index chain and has
       neither the NGramFilterFactory nor the second WordDelimiterFilterFactory
       step - verify that query tokens can match the indexed tokens. -->
  <analyzer type="query">
    <!-- NOTE(review): the factory is usually spelled WhitespaceTokenizerFactory;
         verify that this spelling resolves on the Solr version in use. -->
    <tokenizer class="solr.WhiteSpaceTokenizerFactory"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.WordDelimiterFilterFactory" types="at-under-alpha.txt"/>
    <!-- Drops common words listed in the German snowball-format stopword file. -->
    <filter class="solr.StopFilterFactory"
            ignoreCase="true"
            words="lang/stopwords_de.txt"
            format="snowball"
            enablePositionIncrements="true"/>
    <filter class="solr.GermanNormalizationFilterFactory"/>
    <filter class="solr.SnowballPorterFilterFactory" language="German"/>
  </analyzer>

</fieldType>

 

 

Solrconfig:

 

> <queryParser name="synonym_edismax"

> class="solr.SynonymExpandingExtendedDismaxQParserPlugin"> <!-- Registers a query parser plugin under the name "synonym_edismax". -->

>   <lst name="synonymAnalyzers"> <!-- Container for the named analyzer definitions that follow. -->

> <lst name="myCoolAnalyzer"> <!-- One analyzer definition: a tokenizer followed by two filters, listed in order. -->

>   <lst name="tokenizer">

> <str name="class">standard</str>

>   </lst>

>   <lst name="filter"> <!-- Shingle filter: shingle sizes 2 to 4, with unigram output switched on. -->

> <str name="class">shingle</str>

> <str name="outputUnigramsIfNoShingles">true</str>

> <str name="outputUnigrams">true</str>

> <str name="minShingleSize">2</str>

> <str name="maxShingleSize">4</str>

>   </lst>

>   <lst name="filter"> <!-- Synonym filter: reads synonyms.txt, with expand and ignoreCase set to true. -->

> <str name="class">synonym</str>

> <str name="tokenizerFactory">solr.KeywordTokenizerFactory</str>

> <str name="synonyms">synonyms.txt</str>

> <str name="expand">true</str>

> <str name="ignoreCase">true</str>

>   </lst>

> </lst>

>   </lst>

> </queryParser>

> 

> <requestHandler name="/select2" class="solr.SearchHandler"> <!-- Search handler registered at /select2; every parameter below sits in its "defaults" list. -->

>      <lst name="defaults">

>        <str name="echoParams">explicit</str>

>        <int name="rows">10</int>

>        <str name="defType">synonym_edismax</str> <!-- Selects the query parser by the name synonym_edismax. -->

>    <str name="synonyms">true</str>

>    <str name="qf">plain_text^10 editorschoice^200

> title^20 h_*^14

> tags^10 thema^15 inhaltstyp^6 breadcrumb^6 doctype^10

> contentmanager^5 links^5

> last_modified^5 url^5

>    </str> <!-- qf above: the list of queried fields, each with a ^boost. -->

>    <str name="bq">(expiration:[NOW TO *] OR (*:* 

> -expiration:*))^6</str> <!-- bq: boosts (^6) documents whose expiration is in [NOW TO *] or that match *:* without any expiration value. -->

>    <str name="bf">div(clicks,max(displays,1))^8</str> <!-- tested -->

> 

>        <str name="df">text</str>

>    <str name="fl">*,path,score</str>

>    <str name="wt">json</str>

>    <str name="q.op">AND</str>

> 

>    <!-- Highlighting defaults -->

>        <str name="hl">on</str>

>        <str name="hl.fl">plain_text,title</str>

>    <str name="hl.fragSize">200</str>

>    <str name="hl.simple.pre">&lt;b&gt;</str>

>        <str name="hl.simple.post">&lt;/b&gt;</str>

> 

> <!-- <lst name="invariants"> --> <!-- NOTE(review): this opening tag is commented out, so the facet parameters below remain inside the "defaults" list. -->

>     <str name="facet">on</str>

> <str name="facet.mincount">1</str>

>         <str name="facet.field">{!ex=inhaltstyp_s}inhaltstyp_s</str>

> <str name="f.inhaltstyp_s.facet.sort">index</str>

> <str name="facet.field">{!ex=doctype}doctype</str>

> <str name="f.doctype.facet.sort">index</str>

> <str name="facet.field">{!ex=thema_f}thema_f</str>

> <str name="f.thema_f.facet.sort">index</str>

> <str name="facet.field">{!ex=author_s}author_s</str>

> <str name="f.author_s.facet.sort">index</str>

> <str

> name="facet.field">{!ex=sachverstaendiger_s}sachverstaendiger_s</str>

> <str name="f.sachverstaendiger_s.facet.sort">index</str>

> <str name="facet.field">{!ex=veranstaltung_s}veranstaltung_s</str>

> <str name="f.veranstaltung_s.facet.sort">index</str>

> <str name="facet.date">{!ex=last_modified}last_modified</str>

> <str name="facet.date.gap">+1MONTH</str>

> <str name="facet.date.end">NOW/MONTH+1MONTH</str>

> <str name="facet.date.start">NOW/MONTH-36MONTHS</str>

> <str name="facet.date.other">after</str>

> 

>        </lst> <!-- Closes the "defaults" list opened at the top of the handler. -->

> </requestHandler>

wrong results with wdf & ngtf

Reply via email to