I have an attached an updated patch with following changes: 1. Respects ShortWord and MinWords 2. Uses hlCover instead of Cover 3. Does not store norm (or lexeme) for headline marking 4. Removes ts_rank.h 5. Earlier it was counting even NONWORDTOKEN in the headline. Now it only counts the actual words and excludes spaces etc.
I have also changed NumFragments option to MaxFragments as there may not be enough covers to display NumFragments. Another change that I was thinking: Right now if cover size > max_words then I just cut the trailing words. Instead I was thinking that we should split the cover into more fragments such that each fragment contains a few query words. Then each fragment will not contain all query words but will show more occurrences of query words in the headline. I would like to know what your opinion on this is. -Sushant. On Thu, 2008-06-05 at 20:21 +0400, Teodor Sigaev wrote: > > A couple of caveats: > > > > 1. ts_headline testing was done with current cvs head where as > > headline_with_fragments was done with postgres 8.3.1. > > 2. For headline_with_fragments, TSVector for the document was obtained > > by joining with another table. > > Are these differences understandable? > > That is possible situation because ts_headline has several criterias of > 'best' > covers - length, number of words from query, good words at the begin and at > the > end of headline while your fragment's algorithm takes care only on total > number > of words in all covers. It's not very good, but it's acceptable, I think. > Headline (and ranking too) hasn't any formal rules to define is it good or > bad? > Just a people's opinions. > > Next possible reason: original algorithm had a look on all covers trying to > find > the best one while your algorithm tries to find just the shortest covers to > fill > a headline. > > But it's very desirable to use ShortWord - it's not very comfortable for user > if > one option produces unobvious side effect with another one. > ` > > > If you think these caveats are the reasons or there is something I am > > missing, then I can repeat the entire experiments with exactly the same > > conditions. > > Interesting for me test is a comparing hlCover with Cover in your patch, i.e. > develop a patch which uses hlCover instead of Cover and compare old patch > with > new one.
Index: src/backend/tsearch/wparser_def.c =================================================================== RCS file: /home/sushant/devel/pgsql-cvs/pgsql/src/backend/tsearch/wparser_def.c,v retrieving revision 1.14 diff -c -r1.14 wparser_def.c *** src/backend/tsearch/wparser_def.c 1 Jan 2008 19:45:52 -0000 1.14 --- src/backend/tsearch/wparser_def.c 21 Jun 2008 07:59:02 -0000 *************** *** 1684,1701 **** return false; } ! Datum ! prsd_headline(PG_FUNCTION_ARGS) { ! HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); ! List *prsoptions = (List *) PG_GETARG_POINTER(1); ! TSQuery query = PG_GETARG_TSQUERY(2); ! /* from opt + start and and tag */ ! int min_words = 15; ! int max_words = 35; ! int shortword = 3; int p = 0, q = 0; int bestb = -1, --- 1684,1891 ---- return false; } ! static void ! mark_fragment(HeadlineParsedText *prs, int highlight, int startpos, int endpos) { ! int i; ! char *coversep = "..."; ! int coverlen = strlen(coversep); ! for (i = startpos; i <= endpos; i++) ! { ! if (prs->words[i].item) ! prs->words[i].selected = 1; ! if (highlight == 0) ! { ! if (HLIDIGNORE(prs->words[i].type)) ! prs->words[i].replace = 1; ! } ! else ! { ! if (XMLHLIDIGNORE(prs->words[i].type)) ! prs->words[i].replace = 1; ! } ! ! prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; ! } ! /* add cover separators if needed */ ! if (startpos > 0 && strncmp(prs->words[startpos-1].word, coversep, ! prs->words[startpos-1].len) != 0) ! { ! ! prs->words[startpos-1].word = repalloc(prs->words[startpos-1].word, sizeof(char) * coverlen); ! prs->words[startpos-1].in = 1; ! prs->words[startpos-1].len = coverlen; ! memcpy(prs->words[startpos-1].word, coversep, coverlen); ! } ! if (endpos-1 < prs->curwords && strncmp(prs->words[startpos-1].word, coversep, ! prs->words[startpos-1].len) != 0) ! { ! prs->words[endpos+1].word = repalloc(prs->words[endpos+1].word, sizeof(char) * coverlen); ! prs->words[endpos+1].in = 1; ! memcpy(prs->words[endpos+1].word, coversep, coverlen); ! } ! } ! ! typedef struct ! { ! int4 startpos; ! int4 endpos; ! int2 in; ! int2 excluded; ! } CoverPos; ! ! ! static void ! mark_hl_fragments(HeadlineParsedText *prs, TSQuery query, int highlight, ! int shortword, int min_words, ! int max_words, int max_fragments) ! { ! int4 curlen, coverlen, i, f, num_f; ! int4 stretch, maxstretch; ! ! int4 startpos = 0, ! endpos = 0, ! p = 0, ! q = 0; ! ! int4 numcovers = 0, ! maxcovers = 32; ! ! int4 min, minI = 0; ! CoverPos *covers; ! ! covers = palloc(maxcovers * sizeof(CoverPos)); ! ! /* get all covers */ ! while (hlCover(prs, query, &p, &q)) ! { ! if (numcovers >= maxcovers) ! { ! maxcovers *= 2; ! covers = repalloc(covers, sizeof(CoverPos) * maxcovers); ! } ! covers[numcovers].startpos = p; ! covers[numcovers].endpos = q; ! ! covers[numcovers].in = 0; ! covers[numcovers].excluded = 0; ! numcovers ++; ! /* move p to generate the next cover */ ! p++; ! } ! ! /* choose best covers */ ! for (f = 0; f < max_fragments; f++) ! { ! min = 0x7fffffff; ! for (i = 0; i < numcovers; i ++) ! { ! coverlen = covers[i].endpos - covers[i].startpos + 1; ! if (!covers[i].in && !covers[i].excluded && min > coverlen) ! { ! min = coverlen; ! minI = i; ! } ! } ! if (min < 0x7fffffff) ! { ! covers[minI].in = 1; ! /* adjust the size of cover ! * if max_words >= len ! * then headline from ext.p - (max_words-len)/2 to ext.q + (maxcoverSize-len) /2 ! * if maxcoverSize < len ! * then headline from ext.p to ext.p + maxcoverSize ! */ ! startpos = covers[minI].startpos; ! endpos = covers[minI].endpos; ! ! for(curlen = 0, i = startpos; i < endpos && curlen < max_words; i++) ! if (!NONWORDTOKEN(prs->words[i].type)) ! curlen++; ! endpos = i; ! if (curlen < max_words) ! { ! /* divide the stretch on both sides of cover */ ! maxstretch = (max_words - curlen)/2; ! /* first stretch the startpos */ ! stretch = 0; ! for (i = startpos; i >= 0 && stretch < maxstretch; i--) ! { ! if (!NONWORDTOKEN(prs->words[i].type)) ! { ! curlen ++; ! stretch ++; ! } ! } ! /* cut back startpos till we find a non short token */ ! for ( ; i <= startpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i++) ! { ! if (!NONWORDTOKEN(prs->words[i].type)) ! curlen --; ! } ! startpos = i; ! /* now stretch the endpos as much as possible*/ ! for (i = endpos; i < prs->curwords && curlen < max_words; i++) ! { ! if (!NONWORDTOKEN(prs->words[i].type)) ! curlen ++; ! } ! /* cut back endpos till we find a non-short token */ ! for ( ; i >= endpos && (NOENDTOKEN(prs->words[i].type) || prs->words[i].len <= shortword); i--) ! { ! if (!NONWORDTOKEN(prs->words[i].type)) ! curlen --; ! } ! endpos = i; ! } ! covers[minI].startpos = startpos; ! covers[minI].endpos = endpos; ! ! /* exclude overlapping covers */ ! for (i = 0; i < numcovers; i ++) ! { ! if (i != minI && ! (covers[i].startpos >= covers[minI].startpos && ! covers[i].startpos <= covers[minI].endpos)) ! covers[i].excluded = 1; ! } ! } ! else ! break; ! } ! ! /* Mark the chosen fragments (covers) */ + for (num_f = 0, i = 0; i < numcovers; i++) + { + if (covers[i].in && !covers[i].excluded) + { + startpos = covers[i].startpos; + endpos = covers[i].endpos; + + mark_fragment(prs, highlight, covers[i].startpos, covers[i].endpos); + num_f ++; + } + } + /* show at least min_words we have not marked anything*/ + if (num_f <= 0) + { + startpos = endpos = curlen = 0; + for (i = 0; i < prs->curwords && curlen < min_words; i++) + { + if (!NONWORDTOKEN(prs->words[i].type)) + curlen++; + endpos = i; + } + mark_fragment(prs, highlight, startpos, endpos); + } + pfree(covers); + } + static void + mark_hl_words(HeadlineParsedText *prs, TSQuery query, int highlight, + int shortword, int min_words, int max_words) + { int p = 0, q = 0; int bestb = -1, *************** *** 1707,1762 **** curlen; int i; - int highlight = 0; - ListCell *l; - - /* config */ - prs->startsel = NULL; - prs->stopsel = NULL; - foreach(l, prsoptions) - { - DefElem *defel = (DefElem *) lfirst(l); - char *val = defGetString(defel); - - if (pg_strcasecmp(defel->defname, "MaxWords") == 0) - max_words = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "MinWords") == 0) - min_words = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) - shortword = pg_atoi(val, sizeof(int32), 0); - else if (pg_strcasecmp(defel->defname, "StartSel") == 0) - prs->startsel = pstrdup(val); - else if (pg_strcasecmp(defel->defname, "StopSel") == 0) - prs->stopsel = pstrdup(val); - else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) - highlight = (pg_strcasecmp(val, "1") == 0 || - pg_strcasecmp(val, "on") == 0 || - pg_strcasecmp(val, "true") == 0 || - pg_strcasecmp(val, "t") == 0 || - pg_strcasecmp(val, "y") == 0 || - pg_strcasecmp(val, "yes") == 0); - else - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("unrecognized headline parameter: \"%s\"", - defel->defname))); - } if (highlight == 0) { - if (min_words >= max_words) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("MinWords should be less than MaxWords"))); - if (min_words <= 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("MinWords should be positive"))); - if (shortword < 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("ShortWord should be >= 0"))); - while (hlCover(prs, query, &p, &q)) { /* find cover len in words */ --- 1897,1905 ---- *************** *** 1877,1882 **** --- 2020,2101 ---- prs->words[i].in = (prs->words[i].repeated) ? 0 : 1; } + } + + Datum + prsd_headline(PG_FUNCTION_ARGS) + { + HeadlineParsedText *prs = (HeadlineParsedText *) PG_GETARG_POINTER(0); + List *prsoptions = (List *) PG_GETARG_POINTER(1); + TSQuery query = PG_GETARG_TSQUERY(2); + + /* from opt + start and and tag */ + int min_words = 15; + int max_words = 35; + int shortword = 3; + int max_fragments = 0; + int highlight = 0; + ListCell *l; + + /* config */ + prs->startsel = NULL; + prs->stopsel = NULL; + foreach(l, prsoptions) + { + DefElem *defel = (DefElem *) lfirst(l); + char *val = defGetString(defel); + + if (pg_strcasecmp(defel->defname, "MaxWords") == 0) + max_words = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MinWords") == 0) + min_words = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "ShortWord") == 0) + shortword = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "MaxFragments") == 0) + max_fragments = pg_atoi(val, sizeof(int32), 0); + else if (pg_strcasecmp(defel->defname, "StartSel") == 0) + prs->startsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "StopSel") == 0) + prs->stopsel = pstrdup(val); + else if (pg_strcasecmp(defel->defname, "HighlightAll") == 0) + highlight = (pg_strcasecmp(val, "1") == 0 || + pg_strcasecmp(val, "on") == 0 || + pg_strcasecmp(val, "true") == 0 || + pg_strcasecmp(val, "t") == 0 || + pg_strcasecmp(val, "y") == 0 || + pg_strcasecmp(val, "yes") == 0); + else + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("unrecognized headline parameter: \"%s\"", + defel->defname))); + } + + if (highlight == 0) + { + if (min_words >= max_words) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinWords should be less than MaxWords"))); + if (min_words <= 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MinWords should be positive"))); + if (shortword < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ShortWord should be >= 0"))); + if (max_fragments < 0) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("MaxFragments should be >= 0"))); + + if (max_fragments == 0) + /* call the default headline generator */ + mark_hl_words(prs, query, highlight, shortword, min_words, max_words); + else + mark_hl_fragments(prs, query, highlight, shortword, min_words, max_words, max_fragments); + } if (!prs->startsel) prs->startsel = pstrdup("<b>"); if (!prs->stopsel) *************** *** 1886,1888 **** --- 2105,2108 ---- PG_RETURN_POINTER(prs); } +
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers