Highlighting snippets truncated when matching large number of indexed documents

hsharma mailinglists Wed, 02 Sep 2015 00:22:26 -0700

Hi there,

I'm observing that the snippets being returned in the highlighting
section of the response are getting truncated. However, this behavior
is being seen only when the query matches a large number of documents
and the results requested are near the end of the Solr-returned
overall results list.


I'm using Solr 5.2.1 (Java 1.8.0_51) and my document is defined in
terms of the following two fields, as specified in the schema file:

  <fieldType name="be_string" class="solr.TextField" sortMissingLast="true">
    <analyzer>
      <tokenizer class="solr.StandardTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
  </fieldType>
  <fieldType name="be_ngramstring" class="solr.TextField" sortMissingLast="true">
    <analyzer type="index">
      <tokenizer class="solr.StandardTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
      <filter class="solr.EdgeNGramFilterFactory" minGramSize="1" maxGramSize="20"/>
    </analyzer>
    <analyzer type="query">
      <tokenizer class="solr.StandardTokenizerFactory" />
      <filter class="solr.LowerCaseFilterFactory" />
    </analyzer>
  </fieldType>

  <field name="name" type="be_string"/>
  <field name="name_edgengram" type="be_ngramstring" stored="true"/>
  <copyField source="name" dest="name_edgengram"/>

Hence, the fields of interest are called "name" and "name_edgengram".

I search for the word 'data' and Solr indicates that there are 565
results. I retrieve 10 results at a time, and the highlighting works
fine until I request 10 results from Solr starting at offset 490.
The HTTP request made is >>

http://localhost:8983/solr/mycore/select?q=name%3A%22data%22+OR+name_edgengram%3A%22data%22&start=490&fl=id%2Cname&wt=json&indent=true&hl=true&hl.fl=name%2Cname_edgengram&hl.simple.pre=%3Cem%3E&hl.simple.post=%3C%2Fem%3E&hl.highlightMultiTerm=true&hl.fragSize=0

My highlighting parameters are specified at query-time. I get the
following json response from Solr >>

################################################
{
  "responseHeader": {
    "status": 0,
    "QTime": 76,
    "params": {
      "q": "name:\"data\" OR name_edgengram:\"data\"",
      "hl": "true",
      "hl.simple.post": "</em>",
      "indent": "true",
      "fl": "id,name",
      "start": "490",
      "hl.fragSize": "0",
      "hl.fl": "name,name_edgengram",
      "wt": "json",
      "hl.simple.pre": "<em>",
      "hl.highlightMultiTerm": "true"
    }
  },
  "response": {
    "numFound": 565,
    "start": 490,
    "docs": [
      {
        "name":
"software/information-management/cq-image-jsp-/content/sascom/en_us/software/data-management/jcr:content/par/tabctrl_d036/tab-2-tabImage",
        "id": "p-798-pn9058800-uu303582258"
      },
      {
        "name":
"en_us/whitepapers/how-to-advance-your-data-mining-predictive-analytics-with-modern-techniques-106219.html",
        "id": "p-798-pn9677905-uu304125128"
      },
      {
        "name":
"en_us/insights/cq-image-jsp-/content/sascom/en_us/insights/data-management/jcr:content/par/tabctrl_4a63/tab-0/styledcontainer_231d/par/styledcontainer_3919/par/image_a747",
        "id": "p-798-pn9058609-uu303582055"
      },
      {
        "name":
"software/smb/cq-textimage-jsp-/content/sascom/en_us/software/small-midsize-business/desktop-data-mining/jcr:content/par/styledcontainer_6b5c/par/contentcarousel_ea6/cntntcarousel/textimage_e28",
        "id": "p-798-pn9058629-uu303582076"
      },
      {
        "name":
"en_us/whitepapers/harvard-business-review-the-evolution-of-decision-making-how-leading-organizations-are-adopting-a-data-driven-culture105998.html",
        "id": "p-798-pn9677481-uu297657017"
      },
      {
        "id": "kw-798-3075204",
        "name": "mpp database"
      },
      {
        "id": "kw-798-951983",
        "name": "In-Database Analytics"
      },
      {
        "id": "kw-798-3075206",
        "name": "in-memory database"
      },
      {
        "name": "software/data_mining/",
        "id": "p-798-pn30459505-uu376483712"
      },
      {
        "name": "rnd/datavisualization/",
        "id": "p-798-pn68559-uu524630"
      }
    ]
  },
  "highlighting": {
    "p-798-pn9058800-uu303582258": {
      "name": [
        
"software/information-management/cq-image-jsp-/content/sascom/en_us/software/<em>data</em>"
      ],
      "name_edgengram": [
        
"software/information-management/cq-image-jsp-/content/sascom/en_us/software/<em>data</em>"
      ]
    },
    "p-798-pn9677905-uu304125128": {
      "name": [
        
"en_us/whitepapers/how-to-advance-your-<em>data</em>-mining-predictive-analytics-with"
      ],
      "name_edgengram": [
        
"en_us/whitepapers/how-to-advance-your-<em>data</em>-mining-predictive-analytics-with"
      ]
    },
    "p-798-pn9058609-uu303582055": {
      "name": [
        
"en_us/insights/cq-image-jsp-/content/sascom/en_us/insights/<em>data</em>-management"
      ],
      "name_edgengram": [
        
"en_us/insights/cq-image-jsp-/content/sascom/en_us/insights/<em>data</em>-management"
      ]
    },
    "p-798-pn9058629-uu303582076": {
      "name": [
        
"-business/desktop-<em>data</em>-mining/jcr:content/par/styledcontainer_6b5c/par/contentcarousel_ea6/cntntcarousel"
      ],
      "name_edgengram": [
        
"-business/desktop-<em>data</em>-mining/jcr:content/par/styledcontainer_6b5c/par/contentcarousel_ea6/cntntcarousel"
      ]
    },
    "p-798-pn9677481-uu297657017": {
      "name": [
        
"-leading-organizations-are-adopting-a-<em>data</em>-driven-culture105998.html"
      ],
      "name_edgengram": [
        
"-leading-organizations-are-adopting-a-<em>data</em>-driven-culture105998.html"
      ]
    },
    "kw-798-3075204": {
      "name_edgengram": [
        "mpp <em>database</em>"
      ]
    },
    "kw-798-951983": {
      "name_edgengram": [
        "In-<em>Database</em> Analytics"
      ]
    },
    "kw-798-3075206": {
      "name_edgengram": [
        "in-memory <em>database</em>"
      ]
    },
    "p-798-pn30459505-uu376483712": {
      "name_edgengram": [
        "/software/<em>data_mining</em>/"
      ]
    },
    "p-798-pn68559-uu524630": {
      "name_edgengram": [
        "rnd/<em>datavisualization</em>/"
      ]
    }
  }
}
################################################

As you can see, for the document with id "p-798-pn9058629-uu303582076" >>

The "name" field has the value
"software/smb/cq-textimage-jsp-/content/sascom/en_us/software/small-midsize-business/desktop-data-mining/jcr:content/par/styledcontainer_6b5c/par/contentcarousel_ea6/cntntcarousel/textimage_e28".
But the corresponding highlight snippets are truncated and have these values >>
      "name": [
        
"-business/desktop-<em>data</em>-mining/jcr:content/par/styledcontainer_6b5c/par/contentcarousel_ea6/cntntcarousel"
      ],
      "name_edgengram": [
        
"-business/desktop-<em>data</em>-mining/jcr:content/par/styledcontainer_6b5c/par/contentcarousel_ea6/cntntcarousel"
      ]

Similarly, for the document with id "p-798-pn9677481-uu297657017" >>

The "name" field has the value
"en_us/whitepapers/harvard-business-review-the-evolution-of-decision-making-how-leading-organizations-are-adopting-a-data-driven-culture105998.html".
But the corresponding highlight snippets are truncated and have these values >>
      "name": [
        
"-leading-organizations-are-adopting-a-<em>data</em>-driven-culture105998.html"
      ],
      "name_edgengram": [
        
"-leading-organizations-are-adopting-a-<em>data</em>-driven-culture105998.html"
      ]



[1] I would like to understand why this truncation of the
highlighting snippets happens only when retrieving the later results
from a somewhat large set of matched documents.

[2] Also, it appears that the highlight snippets for both fields are
getting truncated (indicating that the edge n-gram filter does not
have much to do with this truncation?). That is, when truncation
happens, it happens for both the fields specified in the hl.fl
parameter of the http request; or it doesn't happen for either of
them.

[3] I'm aware that there was an issue resembling the one I have
described above, for earlier versions of Solr >>

      ** https://issues.apache.org/jira/browse/SOLR-3110 :: Search
result comes up with truncated words at the start of highlighted
fragment
      ** https://issues.apache.org/jira/browse/LUCENE-1822 ::
FastVectorHighlighter: SimpleFragListBuilder hard-coded 6 char margin
is too naive
      ** https://issues.apache.org/jira/browse/LUCENE-1824 ::
FastVectorHighlighter truncates words at beginning and end of
fragments

To the best of my knowledge, I'm not making use of the
FastVectorHighlighter (which is the highlighter implementation
associated with the snippet-truncation in the JIRA issues listed
above). I believe I'm using the Standard Highlighter because, as per
https://cwiki.apache.org/confluence/display/solr/Standard+Highlighter,
I'm passing the highlighting parameters to the request
handler with the query. Further, the parameter "hl.maxAnalyzedChars"
is set to its default value of 51,200 characters and the documents
above whose highlighting snippets were truncated have field-values
with a much lower number of characters.

My solrconfig contains the following definitions of the request
handler and the search component for highlighting:

  <requestHandler name="/select" class="solr.SearchHandler">
    <lst name="defaults">
      <str name="echoParams">explicit</str>
      <int name="rows">10</int>
    </lst>
  </requestHandler>

  <searchComponent class="solr.HighlightComponent" name="highlight">
    <highlighting>
      <fragmenter name="gap"
                  default="true"
                  class="solr.highlight.GapFragmenter">
        <lst name="defaults">
          <int name="hl.fragsize">100</int>
        </lst>
      </fragmenter>
      <fragmenter name="regex"
                  class="solr.highlight.RegexFragmenter">
        <lst name="defaults">
          <int name="hl.fragsize">70</int>
          <float name="hl.regex.slop">0.5</float>
          <str name="hl.regex.pattern">[-\w ,/\n\&quot;&apos;]{20,200}</str>
        </lst>
      </fragmenter>

      <formatter name="html"
                 default="true"
                 class="solr.highlight.HtmlFormatter">
        <lst name="defaults">
          <str name="hl.simple.pre"><![CDATA[<em>]]></str>
          <str name="hl.simple.post"><![CDATA[</em>]]></str>
        </lst>
      </formatter>

      <encoder name="html"
               class="solr.highlight.HtmlEncoder" />

      <fragListBuilder name="simple"
                       class="solr.highlight.SimpleFragListBuilder"/>

      <fragListBuilder name="single"
                       class="solr.highlight.SingleFragListBuilder"/>

      <fragListBuilder name="weighted"
                       default="true"
                       class="solr.highlight.WeightedFragListBuilder"/>

      <fragmentsBuilder name="default"
                        default="true"
                        class="solr.highlight.ScoreOrderFragmentsBuilder">
      </fragmentsBuilder>

      <fragmentsBuilder name="colored"
                        class="solr.highlight.ScoreOrderFragmentsBuilder">
        <lst name="defaults">
          <str name="hl.tag.pre"><![CDATA[
               <b style="background:yellow">,<b style="background:lawgreen">,
               <b style="background:aquamarine">,<b style="background:magenta">,
               <b style="background:palegreen">,<b style="background:coral">,
               <b style="background:wheat">,<b style="background:khaki">,
               <b style="background:lime">,<b
style="background:deepskyblue">]]></str>
          <str name="hl.tag.post"><![CDATA[</b>]]></str>
        </lst>
      </fragmentsBuilder>

      <boundaryScanner name="default"
                       default="true"
                       class="solr.highlight.SimpleBoundaryScanner">
        <lst name="defaults">
          <str name="hl.bs.maxScan">10</str>
          <str name="hl.bs.chars">.,!? &#9;&#10;&#13;</str>
        </lst>
      </boundaryScanner>

      <boundaryScanner name="breakIterator"
                       class="solr.highlight.BreakIteratorBoundaryScanner">
        <lst name="defaults">
          <str name="hl.bs.type">WORD</str>
          <str name="hl.bs.language">en</str>
          <str name="hl.bs.country">US</str>
        </lst>
      </boundaryScanner>
    </highlighting>
  </searchComponent>


I'd appreciate any insights/suggestions to tackle this.

Also, this is my first post to the mailing list. So please do let me
know if there's any other information that I can provide to make this
explanation better and more complete.


Thanks,
Harsh

Highlighting snippets truncated when matching large number of indexed documents

Reply via email to