Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Sushant Sinha Sat, 21 Jun 2008 07:01:30 -0700

I have an attached an updated patch with following changes:

1. Respects ShortWord and MinWords
2. Uses hlCover instead of Cover
3. Does not store norm (or lexeme) for headline marking
4. Removes ts_rank.h
5. Earlier it was counting even NONWORDTOKEN in the headline. Now it
only counts the actual words and excludes spaces etc.


I have also changed NumFragments option to MaxFragments as there may not
be enough covers to display NumFragments.

Another change that I was thinking:

Right now if cover size > max_words then I just cut the trailing words.
Instead I was thinking that we should split the cover into more
fragments such that each fragment contains a few query words. Then each
fragment will not contain all query words but will show more occurrences
of query words in the headline. I would  like to know what your opinion
on this is.

-Sushant.

On Thu, 2008-06-05 at 20:21 +0400, Teodor Sigaev wrote:
> > A couple of caveats: 
> > 
> > 1. ts_headline testing was done with current cvs head where as
> > headline_with_fragments was done with postgres 8.3.1.
> > 2. For headline_with_fragments, TSVector for the document was obtained
> > by joining with another table.
> > Are these differences understandable?
> 
> That is possible situation because ts_headline has several criterias of 
> 'best' 
> covers - length, number of words from query, good words at the begin and at 
> the 
> end of headline while your fragment's algorithm takes care only on total 
> number 
> of words in all covers. It's not very good, but it's acceptable, I think. 
> Headline (and ranking too) hasn't any formal rules to define is it good or 
> bad? 
> Just a people's opinions.
> 
> Next possible reason: original algorithm had a look on all covers trying to 
> find 
> the best one while your algorithm tries to find just the shortest covers to 
> fill 
> a headline.
> 
> But it's very desirable to use ShortWord - it's not very comfortable for user 
> if 
> one option produces unobvious side effect with another one.
> `
> 
> > If you think these caveats are the reasons or there is something I am
> > missing, then I can repeat the entire experiments with exactly the same
> > conditions. 
> 
> Interesting for me test is a comparing hlCover with Cover in your patch, i.e. 
> develop a patch which uses hlCover instead of Cover and compare  old patch 
> with 
> new one.

Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.14
diff -c -r1.14 wparser_def.c
*** src/backend/tsearch/wparser_def.c	1 Jan 2008 19:45:52 -0000	1.14
--- src/backend/tsearch/wparser_def.c	21 Jun 2008 07:59:02 -0000
***************
*** 1684,1701 ****
  	return false;
  }
  
! Datum
! prsd_headline(PG_FUNCTION_ARGS)
  {
! 	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
! 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
! 	TSQuery		query = PG_GETARG_TSQUERY(2);
  
! 	/* from opt + start and and tag */
! 	int			min_words = 15;
! 	int			max_words = 35;
! 	int			shortword = 3;
  
  	int			p = 0,
  				q = 0;
  	int			bestb = -1,
--- 1684,1891 ----
  	return false;
  }
  
! static void 
! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos)
  {
! 	int   i;
! 	char *coversep = "...";
!        	int   coverlen = strlen(coversep);
  
! 	for (i = startpos; i <= endpos; i++)
! 	{
! 		if (prs->words[i].item)
! 			prs->words[i].selected = 1;
! 		if (highlight == 0)
! 		{
! 			if (HLIDIGNORE(prs->words[i].type))
! 				prs->words[i].replace = 1;
! 		}
! 		else
! 		{
! 			if (XMLHLIDIGNORE(prs->words[i].type))
! 				prs->words[i].replace = 1;
! 		}
! 
! 		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
! 	}
! 	/* add cover separators if needed */ 
! 	if (startpos > 0 && strncmp(prs->words[startpos-1].word, coversep, 
! 						prs->words[startpos-1].len) != 0)
! 	{
! 		
! 		prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * coverlen);
! 		prs->words[startpos-1].in   = 1;
! 		prs->words[startpos-1].len  = coverlen;
! 		memcpy(prs->words[startpos-1].word, coversep, coverlen);
! 	}
! 	if (endpos-1 < prs->curwords &&  strncmp(prs->words[startpos-1].word, coversep,
! 						prs->words[startpos-1].len) != 0)
! 	{
! 		prs->words[endpos+1].word = repalloc(prs->words[endpos+1].word, sizeof(char) * coverlen);
! 		prs->words[endpos+1].in   = 1;
! 		memcpy(prs->words[endpos+1].word, coversep, coverlen);
! 	}
! }
! 
! typedef struct 
! {
! 	int4 startpos;
! 	int4 endpos;
! 	int2 in;
! 	int2 excluded;
! } CoverPos;
! 
! 
! static void
! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight,
!                         int shortword, int min_words, 
! 			int max_words, int max_fragments)
! {
! 	int4           	curlen, coverlen, i, f, num_f;
! 	int4		stretch, maxstretch;
! 
! 	int4           	startpos = 0, 
!  			endpos   = 0,
! 			p        = 0,
! 			q        = 0;
! 
! 	int4		numcovers = 0, 
! 			maxcovers = 32;
! 
! 	int4          	min, minI = 0;
! 	CoverPos	*covers;
! 
! 	covers = palloc(maxcovers * sizeof(CoverPos));
!  
! 	/* get all covers */
! 	while (hlCover(prs, query, &p, &q))
! 	{
! 		if (numcovers >= maxcovers)
! 		{
! 			maxcovers *= 2;
! 			covers     = repalloc(covers, sizeof(CoverPos) * maxcovers);
! 		}
! 		covers[numcovers].startpos = p;
! 		covers[numcovers].endpos   = q;
! 
! 		covers[numcovers].in       = 0;
! 		covers[numcovers].excluded = 0;
! 		numcovers ++;
!  		/* move p to generate the next cover */
!  		p++;
! 	}
! 
! 	/* choose best covers */
! 	for (f = 0; f < max_fragments; f++)
! 	{
! 		min = 0x7fffffff;
! 		for (i = 0; i < numcovers; i ++)
! 		{
! 			coverlen = covers[i].endpos - covers[i].startpos + 1;
! 			if (!covers[i].in &&  !covers[i].excluded && min > coverlen)
! 			{
! 				min  = coverlen;
! 				minI = i;
! 			}
! 		}
! 		if (min < 0x7fffffff)
! 		{
! 			covers[minI].in = 1;
! 			/* adjust the size of cover
! 			* if max_words >= len
! 			*      then headline from ext.p - (max_words-len)/2 to ext.q + (maxcoverSize-len) /2
! 			* if maxcoverSize < len
! 			*      then headline from ext.p to ext.p +  maxcoverSize
! 			*/ 
! 			startpos = covers[minI].startpos;
! 			endpos   = covers[minI].endpos;
! 
! 			for(curlen = 0, i = startpos; i < endpos && curlen < max_words; i++)
! 				if (!NONWORDTOKEN(prs->words[i].type))
! 					curlen++;
! 			endpos = i;
! 			if (curlen < max_words) 
! 			{
! 				/* divide the stretch on both sides of cover */
! 				maxstretch = (max_words - curlen)/2;
! 				/* first stretch the startpos */
! 				stretch = 0;
! 				for (i = startpos; i >= 0 && stretch < maxstretch; i--)
! 				{
! 					if (!NONWORDTOKEN(prs->words[i].type))
! 					{
! 						curlen  ++;
! 						stretch ++;
! 					}
! 				}
! 				/* cut back startpos till we find a non short token */
! 				for ( ; i <= startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++)
! 				{
! 					if (!NONWORDTOKEN(prs->words[i].type))
! 						curlen --;
! 				}
! 				startpos = i;
! 				/* now stretch the endpos as much as possible*/
! 				for (i = endpos; i < prs->curwords && curlen < max_words; i++)
! 				{
! 					if (!NONWORDTOKEN(prs->words[i].type))
! 						curlen  ++;
! 				}
! 				/* cut back endpos till we find a non-short token */
! 				for ( ; i >= endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--)
! 				{
! 					if (!NONWORDTOKEN(prs->words[i].type))
! 						curlen --;
! 				}
! 				endpos = i;
! 			}
! 			covers[minI].startpos = startpos;
! 			covers[minI].endpos   = endpos;
! 
! 			/* exclude overlapping covers */
! 			for (i = 0; i < numcovers; i ++)
! 			{
! 				if (i != minI && 
!                                     (covers[i].startpos >= covers[minI].startpos &&
!                                     covers[i].startpos <= covers[minI].endpos)) 
! 					covers[i].excluded = 1;
! 			}
! 		}
! 		else
! 			break;
! 	}
! 
! 	/* Mark the chosen fragments (covers) */
  
+ 	for (num_f = 0, i = 0; i < numcovers; i++)
+ 	{
+ 		if (covers[i].in && !covers[i].excluded)
+ 		{
+ 			startpos = covers[i].startpos;
+ 			endpos   = covers[i].endpos;
+ 		
+ 			mark_fragment(prs, highlight, covers[i].startpos, covers[i].endpos);
+ 			num_f ++;
+ 		}
+ 	}
+ 	/* show at least min_words we have not marked anything*/
+ 	if (num_f <= 0)
+ 	{
+ 		startpos = endpos = curlen = 0;
+ 		for (i = 0; i < prs->curwords && curlen < min_words; i++)
+ 		{
+ 			if (!NONWORDTOKEN(prs->words[i].type))
+ 				curlen++;
+ 			endpos = i;
+ 		}
+ 		mark_fragment(prs, highlight, startpos, endpos);
+ 	}
+ 	pfree(covers);
+ }
+ static void
+ mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, 
+ 		int shortword, int min_words, int max_words)
+ {
  	int			p = 0,
  				q = 0;
  	int			bestb = -1,
***************
*** 1707,1762 ****
  				curlen;
  
  	int			i;
- 	int			highlight = 0;
- 	ListCell   *l;
- 
- 	/* config */
- 	prs->startsel = NULL;
- 	prs->stopsel = NULL;
- 	foreach(l, prsoptions)
- 	{
- 		DefElem    *defel = (DefElem *) lfirst(l);
- 		char	   *val = defGetString(defel);
- 
- 		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
- 			max_words = pg_atoi(val, sizeof(int32), 0);
- 		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
- 			min_words = pg_atoi(val, sizeof(int32), 0);
- 		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
- 			shortword = pg_atoi(val, sizeof(int32), 0);
- 		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
- 			prs->startsel = pstrdup(val);
- 		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
- 			prs->stopsel = pstrdup(val);
- 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
- 			highlight = (pg_strcasecmp(val, "1") == 0 ||
- 						 pg_strcasecmp(val, "on") == 0 ||
- 						 pg_strcasecmp(val, "true") == 0 ||
- 						 pg_strcasecmp(val, "t") == 0 ||
- 						 pg_strcasecmp(val, "y") == 0 ||
- 						 pg_strcasecmp(val, "yes") == 0);
- 		else
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("unrecognized headline parameter: \"%s\"",
- 							defel->defname)));
- 	}
  
  	if (highlight == 0)
  	{
- 		if (min_words >= max_words)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("MinWords should be less than MaxWords")));
- 		if (min_words <= 0)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("MinWords should be positive")));
- 		if (shortword < 0)
- 			ereport(ERROR,
- 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 					 errmsg("ShortWord should be >= 0")));
- 
  		while (hlCover(prs, query, &p, &q))
  		{
  			/* find cover len in words */
--- 1897,1905 ----
***************
*** 1877,1882 ****
--- 2020,2101 ----
  		prs->words[i].in = (prs->words[i].repeated) ? 0 : 1;
  	}
  
+ }
+ 
+ Datum
+ prsd_headline(PG_FUNCTION_ARGS)
+ {
+ 	HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0);
+ 	List	   *prsoptions = (List *) PG_GETARG_POINTER(1);
+ 	TSQuery		query = PG_GETARG_TSQUERY(2);
+ 
+ 	/* from opt + start and and tag */
+ 	int			min_words     = 15;
+ 	int			max_words     = 35;
+ 	int			shortword     = 3;
+ 	int			max_fragments = 0;
+ 	int			highlight     = 0;
+ 	ListCell   *l;
+ 
+ 	/* config */
+ 	prs->startsel = NULL;
+ 	prs->stopsel = NULL;
+ 	foreach(l, prsoptions)
+ 	{
+ 		DefElem    *defel = (DefElem *) lfirst(l);
+ 		char	   *val = defGetString(defel);
+ 
+ 		if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+ 			max_words = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "MinWords") == 0)
+ 			min_words = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "ShortWord") == 0)
+ 			shortword = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0)
+ 			max_fragments = pg_atoi(val, sizeof(int32), 0);
+ 		else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+ 			prs->startsel = pstrdup(val);
+ 		else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+ 			prs->stopsel = pstrdup(val);
+ 		else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0)
+ 			highlight = (pg_strcasecmp(val, "1") == 0 ||
+ 						 pg_strcasecmp(val, "on") == 0 ||
+ 						 pg_strcasecmp(val, "true") == 0 ||
+ 						 pg_strcasecmp(val, "t") == 0 ||
+ 						 pg_strcasecmp(val, "y") == 0 ||
+ 						 pg_strcasecmp(val, "yes") == 0);
+ 		else
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("unrecognized headline parameter: \"%s\"",
+ 							defel->defname)));
+ 	}
+ 
+ 	if (highlight == 0)
+ 	{
+ 		if (min_words >= max_words)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("MinWords should be less than MaxWords")));
+ 		if (min_words <= 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("MinWords should be positive")));
+ 		if (shortword < 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("ShortWord should be >= 0")));
+ 		if (max_fragments < 0)
+ 			ereport(ERROR,
+ 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+ 					 errmsg("MaxFragments should be >= 0")));
+ 
+ 		if (max_fragments == 0)
+ 			/* call the default headline generator */
+ 			mark_hl_words(prs, query, highlight, shortword, min_words, max_words);
+ 		else
+ 			mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments);
+ 	}
  	if (!prs->startsel)
  		prs->startsel = pstrdup("<b>");
  	if (!prs->stopsel)
***************
*** 1886,1888 ****
--- 2105,2108 ----
  
  	PG_RETURN_POINTER(prs);
  }
+

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] [GENERAL] Fragments in tsearch2 headline

Reply via email to