I'm providing a search feature in a web app that searches for documents
that range in size from 1KB to 200MB of varying MIME types (PDF, DOC,
etc). Currently there are about 3000 documents and this will continue to
grow. I'm providing full word search and partial word search. For each
document, there are three source fields that I'm interested in searching
and highlighting on: name, description, and content. Since I'm providing
both full and partial word search, I've created additional fields that
get tokenized differently: name_par, description_par, and content_par.
Those are indexed and stored as well for querying and highlighting. As
suggested in the Solr wiki, I've got two catch-all fields, text and
text_par, for faster querying. 
 
An average search results page displays 25 results and I provide paging.
I'm just returning the doc ID in my Solr search results and response
times have been quite good (1 to 10 ms). The problem in performance
occurs when I turn on highlighting. I'm already using the
FastVectorHighlighter and depending on the query, it has taken as long
as 15 seconds to get the highlight snippets. However, this isn't always
the case. Certain query terms result in 1 sec or less response time. In
any case, 15 seconds is way too long. 
 
I'm fairly new to Solr but I've spent days coming up with what I've got
so far. Feel free to correct any misconceptions I have. Can anyone
advise me on what I'm doing wrong or offer a better way to set up my core
to improve highlighting performance? 
 
A typical query would look like:
/select?q=foo&start=0&rows=25&fl=id&hl=true 
 
I'm using Solr 4.1. Below are the relevant core schema and config details: 
 
<!-- Bookkeeping fields: _version_ and the document id (required, single-valued) -->
<field name="_version_"
       type="long"
       indexed="true"
       stored="true"/>
<field name="id"
       type="string"
       indexed="true"
       stored="true"
       required="true"
       multiValued="false"/>
 
 
<!-- Whole-word fields (type text_general).
     name, description and content are indexed and stored, with term vectors,
     positions and offsets recorded; the FastVectorHighlighter needs all three.
     The catch-all "text" field is indexed only (not stored, no term vectors). -->
<field name="name"
       type="text_general"
       indexed="true" stored="true" multiValued="true"
       termVectors="true" termPositions="true" termOffsets="true"/>
<field name="description"
       type="text_general"
       indexed="true" stored="true" multiValued="true"
       termVectors="true" termPositions="true" termOffsets="true"/>
<field name="content"
       type="text_general"
       indexed="true" stored="true" multiValued="true"
       termVectors="true" termPositions="true" termOffsets="true"/>
<field name="text"
       type="text_general"
       indexed="true" stored="false" multiValued="true"/>
 
<!-- Partial-word fields (type text_general_partial).
     name_par, description_par and content_par mirror the whole-word fields:
     indexed, stored, and carrying term vectors with positions and offsets.
     The catch-all "text_par" field is indexed only (not stored, no term vectors). -->
<field name="name_par"
       type="text_general_partial"
       indexed="true" stored="true" multiValued="true"
       termVectors="true" termPositions="true" termOffsets="true"/>
<field name="description_par"
       type="text_general_partial"
       indexed="true" stored="true" multiValued="true"
       termVectors="true" termPositions="true" termOffsets="true"/>
<field name="content_par"
       type="text_general_partial"
       indexed="true" stored="true" multiValued="true"
       termVectors="true" termPositions="true" termOffsets="true"/>
<field name="text_par"
       type="text_general_partial"
       indexed="true" stored="false" multiValued="true"/>
 
 
<!-- Partial-word copies: each source field is duplicated into its *_par
     counterpart, which is analyzed with the partial-match field type. -->
<copyField source="name"        dest="name_par"/>
<copyField source="description" dest="description_par"/>
<copyField source="content"     dest="content_par"/>

<!-- Whole-word catch-all: all three source fields feed "text" so a query
     can search a single field. -->
<copyField source="name"        dest="text"/>
<copyField source="description" dest="text"/>
<copyField source="content"     dest="text"/>

<!-- Partial-word catch-all: all three source fields feed "text_par" so a
     partial-word query can search a single field. -->
<copyField source="name"        dest="text_par"/>
<copyField source="description" dest="text_par"/>
<copyField source="content"     dest="text_par"/>
 
<!-- Field type for whole-word matching.
     Index time: standard tokenizer, stop-word removal, lowercasing.
     Query time: the same chain with synonym expansion inserted between the
     stop filter and the lowercase filter. -->
<fieldType name="text_general" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            words="stopwords.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            words="stopwords.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
 
<!-- Field type for partial-word matching.
     Index time: standard tokenizer, stop-word removal, lowercasing, then
     edge n-grams of 2 to 7 characters.
     Query time: no n-gramming; the chain is tokenizer, stop filter, synonym
     expansion, lowercase. -->
<fieldType name="text_general_partial" class="solr.TextField" positionIncrementGap="100">
  <analyzer type="index">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            words="stopwords.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
    <filter class="solr.EdgeNGramFilterFactory"
            minGramSize="2"
            maxGramSize="7"/>
  </analyzer>
  <analyzer type="query">
    <tokenizer class="solr.StandardTokenizerFactory"/>
    <filter class="solr.StopFilterFactory"
            words="stopwords.txt"
            ignoreCase="true"
            enablePositionIncrements="true"/>
    <filter class="solr.SynonymFilterFactory"
            synonyms="synonyms.txt"
            ignoreCase="true"
            expand="true"/>
    <filter class="solr.LowerCaseFilterFactory"/>
  </analyzer>
</fieldType>
 
 
 
<requestHandler name="/select" class="solr.SearchHandler">
  <!-- Default values for query parameters; any of these is overridden by a
       parameter of the same name supplied in the request. -->
  <lst name="defaults">
    <str name="echoParams">explicit</str>
    <int name="rows">10</int>

    <!-- Query parsing: edismax across the two catch-all fields. Whole-word
         matches (text) are boosted above partial-word matches (text_par)
         in the scoring. -->
    <str name="df">text</str>
    <str name="defType">edismax</str>
    <str name="qf">text^2 text_par^1</str>

    <!-- termVectors, termPositions and termOffsets are schema field
         attributes rather than request parameters, so they are declared on
         the fields in the schema and not repeated as handler defaults. -->

    <!-- Highlighting: FastVectorHighlighter with a break-iterator boundary
         scanner, up to 2 snippets of about 162 characters per field, over
         both the whole-word and the partial-word variants of each source
         field. -->
    <bool name="hl.useFastVectorHighlighter">true</bool>
    <str name="hl.boundaryScanner">breakIterator</str>
    <int name="hl.snippets">2</int>
    <str name="hl.fl">name name_par description description_par content content_par</str>
    <int name="hl.fragsize">162</int>
    <str name="hl.fragListBuilder">simple</str>
    <str name="hl.fragmentsBuilder">default</str>

    <!-- Both the hl.simple.* and the hl.tag.* marker pairs wrap each match
         in a strong element. -->
    <str name="hl.simple.pre"><![CDATA[<strong>]]></str>
    <str name="hl.simple.post"><![CDATA[</strong>]]></str>
    <str name="hl.tag.pre"><![CDATA[<strong>]]></str>
    <str name="hl.tag.post"><![CDATA[</strong>]]></str>
  </lst>
</requestHandler>


Cheers!

- Andy

Reply via email to