Thanks Pierre for porting this! I just tested this for my application
and it works. There was a small bug in that startHL has to be
initialized to 0 for each chosen cover. I fixed that and attached the
new patch.
Teodor did not want a separate function. He wanted it as an extension to
ts_headline. One way to do this would be to invoke it only when options
like MaxCoverSize are used. It will be slightly ugly, though.
It still seems to have bugs. I will try to clean that up.
-Sushant.
On Thu, 2008-05-22 at 13:31 +0200, Pierre-Yves Strub wrote:
> Hi,
>
> I've ported the patch of Sushant Sinha for fragmented headlines to pg8.3.1
> (http://archives.postgresql.org/pgsql-general/2007-11/msg00508.php)
>
> W.r.t, http://archives.postgresql.org/pgsql-general/2008-03/msg00806.php
> I can continue the work until this becomes an acceptable patch for pg.
>
> Pierre-yves.
diff -Nurb postgresql-8.3.1/contrib/tsearch2/tsearch2.c postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.c
--- postgresql-8.3.1/contrib/tsearch2/tsearch2.c 2008-01-01 14:45:45.000000000 -0500
+++ postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.c 2008-05-22 18:44:16.000000000 -0400
@@ -82,6 +82,7 @@
Datum tsa_to_tsquery_name(PG_FUNCTION_ARGS);
Datum tsa_plainto_tsquery_name(PG_FUNCTION_ARGS);
Datum tsa_headline_byname(PG_FUNCTION_ARGS);
+Datum tsa_headline_with_fragments(PG_FUNCTION_ARGS);
Datum tsa_ts_stat(PG_FUNCTION_ARGS);
Datum tsa_tsearch2(PG_FUNCTION_ARGS);
Datum tsa_rewrite_accum(PG_FUNCTION_ARGS);
@@ -101,6 +102,7 @@
PG_FUNCTION_INFO_V1(tsa_to_tsquery_name);
PG_FUNCTION_INFO_V1(tsa_plainto_tsquery_name);
PG_FUNCTION_INFO_V1(tsa_headline_byname);
+PG_FUNCTION_INFO_V1(tsa_headline_with_fragments);
PG_FUNCTION_INFO_V1(tsa_ts_stat);
PG_FUNCTION_INFO_V1(tsa_tsearch2);
PG_FUNCTION_INFO_V1(tsa_rewrite_accum);
@@ -358,6 +360,24 @@
return result;
}
+/*
+ * tsa_headline_with_fragments(text, text, tsquery, text)
+ *
+ * tsearch2 compatibility wrapper.  The first argument is the name of a
+ * text search configuration; it is resolved to the configuration's OID
+ * and the call is forwarded, together with the remaining three arguments
+ * (document, query, options), to ts_headline_with_fragments().
+ *
+ * Only four arguments are passed at the SQL level, so exactly four are
+ * read here; there is no argument number 4 to fetch or forward.
+ */
+Datum
+tsa_headline_with_fragments(PG_FUNCTION_ARGS)
+{
+    text       *cfgname = PG_GETARG_TEXT_P(0);
+    Datum       document = PG_GETARG_DATUM(1);
+    Datum       query = PG_GETARG_DATUM(2);
+    Datum       options = PG_GETARG_DATUM(3);
+    Oid         config_oid;
+
+    /* the configuration name has to be converted to an OID */
+    config_oid = TextGetObjectId(regconfigin, cfgname);
+
+    return DirectFunctionCall4(ts_headline_with_fragments,
+                               ObjectIdGetDatum(config_oid),
+                               document, query, options);
+}
+
/*
* tsearch2 version of update trigger
*
diff -Nurb postgresql-8.3.1/contrib/tsearch2/tsearch2.sql.in postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.sql.in
--- postgresql-8.3.1/contrib/tsearch2/tsearch2.sql.in 2007-11-28 14:33:04.000000000 -0500
+++ postgresql-8.3.1-orig/contrib/tsearch2/tsearch2.sql.in 2008-05-22 18:44:16.000000000 -0400
@@ -384,6 +384,11 @@
LANGUAGE INTERNAL
RETURNS NULL ON NULL INPUT IMMUTABLE;
+CREATE FUNCTION headline_with_fragments(text, text, tsquery, text) -- args: configuration name, document, query, headline options
+ RETURNS text
+ AS 'MODULE_PATHNAME', 'tsa_headline_with_fragments'
+ LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
+
-- CREATE the OPERATOR class
CREATE OPERATOR CLASS gist_tsvector_ops
FOR TYPE tsvector USING gist
diff -Nurb postgresql-8.3.1/src/backend/tsearch/ts_parse.c postgresql-8.3.1-orig/src/backend/tsearch/ts_parse.c
--- postgresql-8.3.1/src/backend/tsearch/ts_parse.c 2008-01-01 14:45:52.000000000 -0500
+++ postgresql-8.3.1-orig/src/backend/tsearch/ts_parse.c 2008-05-22 18:49:53.000000000 -0400
@@ -578,6 +578,112 @@
FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
}
+#define COVER_SEP "..."
+#define COVER_SEP_LEN (sizeof(COVER_SEP)-1)
+
+/*
+ * hlparsetext_with_covers
+ *
+ * Tokenize the document "in" with the parser of text search configuration
+ * cfgId and append to "prs" only the tokens that lie inside the covers
+ * flagged with in != 0.
+ *
+ * The document is scanned exactly once, front to back, and the covers are
+ * visited in array order; a cover whose end lies before the current scan
+ * position is skipped.
+ *
+ * The current word position is advanced by one for every non-NULL
+ * LexizeExec() result and by one more for every returned lexeme that
+ * carries TSL_ADDPOS.  Output for a cover starts once that position
+ * reaches the cover's startpos and stops once it reaches its endpos.
+ *
+ * Every entry added for a cover is marked "in"; entries whose "item" is
+ * set are additionally marked "selected".  COVER_SEP is inserted in front
+ * of a cover that does not begin at the first word of the document.
+ */
+void
+hlparsetext_with_covers(Oid cfgId,
+                        HeadlineParsedText *prs,
+                        TSQuery query,
+                        text *in,
+                        struct coverpos *covers,
+                        int4 numcovers)
+{
+    TSParserCacheEntry *prsobj;
+    TSConfigCacheEntry *cfg;
+    void       *prsdata;
+    LexizeData  ldata;
+    int4        icover;
+    int4        currentpos = 0;
+    char       *lemm = NULL;
+    int4        lenlemm = 0;
+    int4        type = 0;
+
+    cfg = lookup_ts_config_cache(cfgId);
+    prsobj = lookup_ts_parser_cache(cfg->prsId);
+
+    prsdata = (void *) DatumGetPointer(FunctionCall2(&(prsobj->prsstart),
+                                       PointerGetDatum(VARDATA(in)),
+                                       Int32GetDatum(VARSIZE(in) - VARHDRSZ)));
+
+    LexizeInit(&ldata, cfg);
+
+    for (icover = 0; icover < numcovers; icover++)
+    {
+        int4        startpos;
+        int4        endpos;
+        int4        startHL = 0;    /* reset for each chosen cover */
+
+        if (!covers[icover].in)
+            continue;
+
+        startpos = covers[icover].startpos;
+        endpos = covers[icover].endpos;
+
+        if (currentpos > endpos)
+        {
+            /* XXX - something wrong ... we have gone past the cover */
+            continue;
+        }
+
+        /*
+         * See if we need to add a cover separator.  Word positions are
+         * 1-based, so a cover starting at position 1 begins with the first
+         * word of the document and must not be preceded by a separator.
+         */
+        if (currentpos < startpos && startpos > 1)
+        {
+            /*
+             * NOTE(review): token type 3 is kept from the original patch;
+             * confirm it is the intended type for a literal separator.
+             */
+            hladdword(prs, COVER_SEP, COVER_SEP_LEN, 3);
+            prs->words[prs->curwords - 1].in = 1;
+        }
+
+        /* add words to the headline only when currentpos crosses startpos */
+        do
+        {
+            TSLexeme   *norms;
+            ParsedLex  *lexs;
+
+            type = DatumGetInt32(FunctionCall3(&(prsobj->prstoken),
+                                               PointerGetDatum(prsdata),
+                                               PointerGetDatum(&lemm),
+                                               PointerGetDatum(&lenlemm)));
+
+            LexizeAddLemm(&ldata, type, lemm, lenlemm);
+
+            do
+            {
+                if ((norms = LexizeExec(&ldata, &lexs)) != NULL)
+                {
+                    TSLexeme   *ptr = norms;
+
+                    currentpos += 1;
+                    while (ptr->lexeme)
+                    {
+                        if (ptr->flags & TSL_ADDPOS)
+                            currentpos += 1;
+                        ptr++;
+                    }
+                }
+
+                /* has the scan reached the beginning of this cover? */
+                if (!startHL && currentpos >= startpos)
+                    startHL = 1;
+
+                if (startHL)
+                {
+                    int4        oldnumwords = prs->curwords;
+                    int4        i;
+
+                    addHLParsedLex(prs, query, lexs, norms);
+
+                    /* flag everything that was just appended */
+                    for (i = oldnumwords; i < prs->curwords; i++)
+                    {
+                        prs->words[i].in = 1;
+                        if (prs->words[i].item)
+                            prs->words[i].selected = 1;
+                    }
+                }
+            } while (norms && currentpos < endpos);
+
+            if (currentpos >= endpos)
+                break;
+        } while (type > 0);
+
+        /*
+         * The parser has reported end of input: nothing more can be added
+         * for the remaining covers, so do not call prstoken again.
+         */
+        if (type <= 0)
+            break;
+    }
+
+    FunctionCall1(&(prsobj->prsend), PointerGetDatum(prsdata));
+}
+
text *
generateHeadline(HeadlineParsedText *prs)
{
diff -Nurb postgresql-8.3.1/src/backend/tsearch/wparser.c postgresql-8.3.1-orig/src/backend/tsearch/wparser.c
--- postgresql-8.3.1/src/backend/tsearch/wparser.c 2008-01-15 13:22:47.000000000 -0500
+++ postgresql-8.3.1-orig/src/backend/tsearch/wparser.c 2008-05-22 18:44:16.000000000 -0400
@@ -370,3 +370,34 @@
PG_GETARG_DATUM(1),
PG_GETARG_DATUM(2)));
}
+
+/*
+ * ts_headline_with_fragments
+ *
+ * Arguments 0..2 are handed unchanged to prsd_headline_with_fragments();
+ * argument 3, when present and non-NULL, is an options string that is
+ * turned into a definition list first.  The HeadlineParsedText that comes
+ * back is rendered with generateHeadline() and then released.
+ */
+Datum
+ts_headline_with_fragments(PG_FUNCTION_ARGS)
+{
+    text       *options_text = NULL;
+    List       *options_list = NIL;
+    Datum       parsed_datum;
+    HeadlineParsedText *parsed;
+    text       *headline;
+
+    if (PG_NARGS() > 3 && PG_GETARG_POINTER(3))
+        options_text = PG_GETARG_TEXT_P(3);
+
+    if (options_text != NULL)
+        options_list = deserialize_deflist(PointerGetDatum(options_text));
+
+    parsed_datum = DirectFunctionCall4(prsd_headline_with_fragments,
+                                       PG_GETARG_DATUM(0),
+                                       PG_GETARG_DATUM(1),
+                                       PG_GETARG_DATUM(2),
+                                       PointerGetDatum(options_list));
+    parsed = (HeadlineParsedText *) DatumGetPointer(parsed_datum);
+
+    headline = generateHeadline(parsed);
+
+    /* release the detoasted options and everything hanging off "parsed" */
+    if (options_text != NULL)
+    {
+        PG_FREE_IF_COPY(options_text, 3);
+    }
+    pfree(parsed->words);
+    pfree(parsed->startsel);
+    pfree(parsed->stopsel);
+    pfree(parsed);
+
+    PG_RETURN_POINTER(headline);
+}
diff -Nurb postgresql-8.3.1/src/backend/tsearch/wparser_def.c postgresql-8.3.1-orig/src/backend/tsearch/wparser_def.c
--- postgresql-8.3.1/src/backend/tsearch/wparser_def.c 2008-01-01 14:45:52.000000000 -0500
+++ postgresql-8.3.1-orig/src/backend/tsearch/wparser_def.c 2008-05-22 18:44:16.000000000 -0400
@@ -19,6 +19,7 @@
#include "tsearch/ts_public.h"
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
+#include "tsearch/ts_rank.h"
#include "utils/builtins.h"
@@ -1886,3 +1887,191 @@
PG_RETURN_POINTER(prs);
}
+
+/*
+ * prsd_headline_with_fragments
+ *
+ * Build the parsed representation of a headline that consists of one or
+ * more fragments of the document, each fragment built around a cover of
+ * the query as reported by Cover().
+ *
+ * Arguments:
+ *   0 - OID of the text search configuration
+ *   1 - document (text)
+ *   2 - query (tsquery)
+ *   3 - List of DefElem options, possibly NIL
+ *
+ * Recognized options and their defaults:
+ *   MaxCoverSize (20)  upper bound on the number of word positions in a
+ *                      single fragment
+ *   MinCoverSize (5)   selection stops once the remaining word budget is
+ *                      not larger than this
+ *   MaxWords (40)      upper bound on the word positions of all fragments
+ *   StartSel ("<b>"), StopSel ("</b>")  highlighting delimiters
+ * Any other option name raises an error.
+ *
+ * Covers are chosen shortest first.  A cover shorter than the available
+ * budget is widened equally on both sides; a longer one is cut at its
+ * end, so that its first position always stays in the headline.
+ *
+ * Returns a palloc'd HeadlineParsedText whose words, startsel and stopsel
+ * members are palloc'd as well.  If the document contains none of the
+ * query operands the result has no words.
+ */
+Datum
+prsd_headline_with_fragments(PG_FUNCTION_ARGS)
+{
+    Oid         cfgId = PG_GETARG_OID(0);
+    text       *in = PG_GETARG_TEXT_P(1);
+    TSQuery     query = PG_GETARG_TSQUERY(2);
+    List       *prsoptions = (List *) PG_GETARG_POINTER(3);
+
+    TSVector    t = (TSVector) DatumGetPointer(DirectFunctionCall2(to_tsvector_byid,
+                                               ObjectIdGetDatum(cfgId),
+                                               PointerGetDatum(in)));
+
+    ListCell   *l;
+    HeadlineParsedText *prs;
+    DocRepresentation *doc;
+    Extention   ext;
+    QueryRepresentation qr;
+    struct coverpos *covers;
+    int4        doclen = 0;
+    int4        numcovers = 0;
+    int4        maxcovers = 32;
+    int4        numWords = 0;
+    int         maxcoverSize = 20;
+    int         mincoverSize = 5;
+    int         maxWords = 40;
+
+    covers = palloc(maxcovers * sizeof(struct coverpos));
+
+    prs = (HeadlineParsedText *) palloc0(sizeof(HeadlineParsedText));
+    prs->lenwords = 32;
+    prs->words = (HeadlineWordEntry *) palloc(sizeof(HeadlineWordEntry) * prs->lenwords);
+    prs->startsel = NULL;
+    prs->stopsel = NULL;
+
+    foreach(l, prsoptions)
+    {
+        DefElem    *defel = (DefElem *) lfirst(l);
+        char       *val = defGetString(defel);
+
+        if (pg_strcasecmp(defel->defname, "MaxCoverSize") == 0)
+            maxcoverSize = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "MinCoverSize") == 0)
+            mincoverSize = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "MaxWords") == 0)
+            maxWords = pg_atoi(val, sizeof(int32), 0);
+        else if (pg_strcasecmp(defel->defname, "StartSel") == 0)
+            prs->startsel = pstrdup(val);
+        else if (pg_strcasecmp(defel->defname, "StopSel") == 0)
+            prs->stopsel = pstrdup(val);
+        else
+            ereport(ERROR,
+                    (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                     errmsg("unrecognized headline parameter: \"%s\"",
+                            defel->defname)));
+    }
+
+    if (mincoverSize >= maxcoverSize)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("MinCoverSize should be less than MaxCoverSize")));
+    if (mincoverSize <= 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("MinCoverSize should be positive")));
+    if (maxWords < 0)
+        ereport(ERROR,
+                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                 errmsg("MaxWords should be non-negative")));
+
+    if (!prs->startsel)
+        prs->startsel = pstrdup("<b>");
+    if (!prs->stopsel)
+        prs->stopsel = pstrdup("</b>");
+    prs->startsellen = strlen(prs->startsel);
+    prs->stopsellen = strlen(prs->stopsel);
+
+    qr.query = query;
+    qr.operandexist = (bool *) palloc0(sizeof(bool) * query->size);
+
+    /* start generating covers for the query */
+    doc = get_docrep(t, &qr, &doclen);
+
+    /* without a document representation there is nothing to select */
+    if (doc != NULL)
+    {
+        /* get all covers */
+        MemSet(&ext, 0, sizeof(Extention));
+        while (Cover(doc, doclen, &qr, &ext))
+        {
+            if (numcovers >= maxcovers)
+            {
+                maxcovers *= 2;
+                covers = repalloc(covers, sizeof(struct coverpos) * maxcovers);
+            }
+            covers[numcovers].startpos = ext.p;
+            covers[numcovers].endpos = ext.q;
+            covers[numcovers].in = 0;
+            numcovers++;
+        }
+
+        /* choose best covers, shortest first, while budget remains */
+        while (maxWords - numWords > mincoverSize)
+        {
+            int4        best = -1;
+            int4        bestlen = 0;
+            int4        budget;
+            int4        startpos;
+            int4        endpos;
+            int4        i;
+
+            for (i = 0; i < numcovers; i++)
+            {
+                int4        coverlen;
+
+                if (covers[i].in)
+                    continue;
+                coverlen = covers[i].endpos - covers[i].startpos + 1;
+                if (best < 0 || coverlen < bestlen)
+                {
+                    best = i;
+                    bestlen = coverlen;
+                }
+            }
+            if (best < 0)
+                break;          /* every cover is already in the headline */
+
+            covers[best].in = 1;
+            startpos = covers[best].startpos;
+            endpos = covers[best].endpos;
+
+            /*
+             * The fragment may use at most the smaller of MaxCoverSize and
+             * the words still available under MaxWords.
+             */
+            budget = maxWords - numWords;
+            if (budget > maxcoverSize)
+                budget = maxcoverSize;
+
+            if (bestlen > budget)
+            {
+                /*
+                 * Too long: keep the first "budget" positions, so that the
+                 * starting lexeme of the cover is in the headline.  The
+                 * range is inclusive, hence the -1.
+                 */
+                endpos = startpos + budget - 1;
+            }
+            else
+            {
+                /* divide the spare budget on both sides of the cover */
+                int4        pad = (budget - bestlen) / 2;
+
+                startpos -= pad;
+                endpos += pad;
+                if (startpos < 1)
+                    startpos = 1;
+
+                /*
+                 * XXX - endpos may now lie beyond the end of the document;
+                 * hlparsetext_with_covers() stops at the end of the input,
+                 * so no check is made here.
+                 */
+            }
+
+            covers[best].startpos = startpos;
+            covers[best].endpos = endpos;
+            numWords += endpos - startpos + 1;
+        }
+
+        /* Render the headline */
+        if (maxWords > 0)
+            hlparsetext_with_covers(cfgId, prs, query, in, covers, numcovers);
+
+        pfree(doc);
+    }
+
+    /*
+     * Clean up.  prsoptions is a List handed over by pointer, not a
+     * detoasted datum, so there is nothing to free for argument 3.
+     */
+    pfree(covers);
+    pfree(t);
+    pfree(qr.operandexist);
+    PG_FREE_IF_COPY(in, 1);
+    PG_FREE_IF_COPY(query, 2);
+
+    PG_RETURN_POINTER(prs);
+}
diff -Nurb postgresql-8.3.1/src/backend/utils/adt/tsrank.c postgresql-8.3.1-orig/src/backend/utils/adt/tsrank.c
--- postgresql-8.3.1/src/backend/utils/adt/tsrank.c 2008-01-01 14:45:53.000000000 -0500
+++ postgresql-8.3.1-orig/src/backend/utils/adt/tsrank.c 2008-05-22 18:44:16.000000000 -0400
@@ -17,6 +17,7 @@
#include "tsearch/ts_type.h"
#include "tsearch/ts_utils.h"
+#include "tsearch/ts_rank.h"
#include "utils/array.h"
#include "miscadmin.h"
@@ -463,14 +464,6 @@
PG_RETURN_FLOAT4(res);
}
-typedef struct
-{
- QueryItem **item;
- int16 nitem;
- uint8 wclass;
- int32 pos;
-} DocRepresentation;
-
static int
compareDocR(const void *va, const void *vb)
{
@@ -482,12 +475,6 @@
return (a->pos > b->pos) ? 1 : -1;
}
-typedef struct
-{
- TSQuery query;
- bool *operandexist;
-} QueryRepresentation;
-
#define QR_GET_OPERAND_EXISTS(q, v) ( (q)->operandexist[ ((QueryItem*)(v)) - GETQUERY((q)->query) ] )
#define QR_SET_OPERAND_EXISTS(q, v) QR_GET_OPERAND_EXISTS(q,v) = true
@@ -499,17 +486,7 @@
return QR_GET_OPERAND_EXISTS(qr, val);
}
-typedef struct
-{
- int pos;
- int p;
- int q;
- DocRepresentation *begin;
- DocRepresentation *end;
-} Extention;
-
-
-static bool
+bool
Cover(DocRepresentation *doc, int len, QueryRepresentation *qr, Extention *ext)
{
DocRepresentation *ptr;
@@ -590,7 +567,7 @@
return Cover(doc, len, qr, ext);
}
-static DocRepresentation *
+DocRepresentation *
get_docrep(TSVector txt, QueryRepresentation *qr, int *doclen)
{
QueryItem *item = GETQUERY(qr->query);
diff -Nurb postgresql-8.3.1/src/include/tsearch/ts_rank.h postgresql-8.3.1-orig/src/include/tsearch/ts_rank.h
--- postgresql-8.3.1/src/include/tsearch/ts_rank.h 1969-12-31 19:00:00.000000000 -0500
+++ postgresql-8.3.1-orig/src/include/tsearch/ts_rank.h 2008-05-22 18:44:16.000000000 -0400
@@ -0,0 +1,36 @@
+/*
+ * ts_rank.h
+ *    Types and functions of the cover search in tsrank.c that are also
+ *    needed by the fragment-based headline code.
+ */
+#ifndef TS_RANK_H
+#define TS_RANK_H
+
+#include "tsearch/ts_type.h"
+#include "tsearch/ts_cache.h"
+
+/*
+ * One entry of the document representation returned by get_docrep().
+ * NOTE(review): presumably one entry per occurrence of a query operand
+ * in the document, "pos" being its word position - confirm in tsrank.c.
+ */
+typedef struct
+{
+    QueryItem **item;
+    int16       nitem;
+    uint8       wclass;
+    int32       pos;
+} DocRepresentation;
+
+/* A query together with one flag per query item (see tsrank.c). */
+typedef struct
+{
+    TSQuery     query;
+    bool       *operandexist;
+} QueryRepresentation;
+
+/*
+ * State of the cover search.  It is zeroed before the first Cover() call;
+ * after each successful call the headline code reads p and q as the first
+ * and last word position of the cover that was found.
+ */
+typedef struct
+{
+    int         pos;
+    int         p;
+    int         q;
+    DocRepresentation *begin;
+    DocRepresentation *end;
+} Extention;
+
+/* Find the next cover; callers loop for as long as this returns true. */
+extern bool Cover(DocRepresentation *doc, int len,
+                  QueryRepresentation *qr, Extention *ext);
+
+/*
+ * Build the document representation of txt for qr->query and store its
+ * length in *doclen.  Callers must be prepared for a NULL result.
+ */
+extern DocRepresentation *get_docrep(TSVector txt, QueryRepresentation *qr,
+                                     int *doclen);
+
+#endif   /* TS_RANK_H */
diff -Nurb postgresql-8.3.1/src/include/tsearch/ts_utils.h postgresql-8.3.1-orig/src/include/tsearch/ts_utils.h
--- postgresql-8.3.1/src/include/tsearch/ts_utils.h 2008-01-01 14:45:59.000000000 -0500
+++ postgresql-8.3.1-orig/src/include/tsearch/ts_utils.h 2008-05-22 18:44:16.000000000 -0400
@@ -14,6 +14,7 @@
#include "tsearch/ts_type.h"
#include "tsearch/ts_public.h"
+#include "tsearch/ts_rank.h"
#include "nodes/pg_list.h"
/*
@@ -95,8 +96,25 @@
* 3 generateHeadline to generate result text
*/
+struct coverpos /* one candidate headline fragment, passed to hlparsetext_with_covers() */
+{
+ int4 startpos; /* first word position of the fragment */
+ int4 endpos; /* last word position of the fragment */
+ int4 in; /* nonzero if the fragment was chosen for the headline; others are skipped */
+};
+
extern void hlparsetext(Oid cfgId, HeadlineParsedText *prs, TSQuery query,
char *buf, int4 buflen);
+
+
+extern void
+hlparsetext_with_covers(Oid cfgId,
+ HeadlineParsedText *prs,
+ TSQuery query,
+ text *in,
+ struct coverpos *covers,
+ int4 numcovers);
+
extern text *generateHeadline(HeadlineParsedText *prs);
/*
@@ -227,6 +245,7 @@
extern Datum prsd_end(PG_FUNCTION_ARGS);
extern Datum prsd_headline(PG_FUNCTION_ARGS);
extern Datum prsd_lextype(PG_FUNCTION_ARGS);
+extern Datum prsd_headline_with_fragments(PG_FUNCTION_ARGS);
/*
* Dictionary interface to SQL
@@ -264,6 +283,7 @@
extern Datum ts_headline_byid(PG_FUNCTION_ARGS);
extern Datum ts_headline(PG_FUNCTION_ARGS);
extern Datum ts_headline_opt(PG_FUNCTION_ARGS);
+extern Datum ts_headline_with_fragments(PG_FUNCTION_ARGS);
/*
* current cfg
diff -Nurb postgresql-8.3.1/src/interfaces/libpq/libpq.rc postgresql-8.3.1-orig/src/interfaces/libpq/libpq.rc
--- postgresql-8.3.1/src/interfaces/libpq/libpq.rc 2008-03-14 23:24:54.000000000 -0400
+++ postgresql-8.3.1-orig/src/interfaces/libpq/libpq.rc 2008-05-22 18:44:16.000000000 -0400
@@ -1,8 +1,8 @@
#include <winver.h>
VS_VERSION_INFO VERSIONINFO
- FILEVERSION 8,3,1,8075
- PRODUCTVERSION 8,3,1,8075
+ FILEVERSION 8,3,1,8143
+ PRODUCTVERSION 8,3,1,8143
FILEFLAGSMASK 0x3fL
FILEFLAGS 0
FILEOS VOS__WINDOWS32
--
Sent via pgsql-general mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-general