Hi all
I would like to propose patch with a set of new small functions for fts in
case of
jsonb data type:
* to_tsvector(config, jsonb) - make a tsvector from all string values and
elements of jsonb object. To prevent the situation, when tsquery can find
a
phrase consisting of lexemes from two different values/elements, this
function will add an increment to position of each lexeme from every new
value/element.
* ts_headline(config, jsonb, tsquery, options) - generate a headline
directly
from jsonb object
Here are the examples how they work:
```
=# select to_tsvector('{"a": "aaa bbb", "b": ["ccc ddd"], "c": {"d": "eee
fff"}}'::jsonb);
to_tsvector
-------------------------------------------------
'aaa':1 'bbb':2 'ccc':4 'ddd':5 'eee':7 'fff':8
(1 row)
=# select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc
ddd"}}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
ts_headline
----------------------
aaa <bbb> ccc <ddd>
(1 row)
```
Any comments or suggestions?
diff --git a/src/backend/tsearch/to_tsany.c b/src/backend/tsearch/to_tsany.c
index 6e5de8f..08e08e5 100644
--- a/src/backend/tsearch/to_tsany.c
+++ b/src/backend/tsearch/to_tsany.c
@@ -16,6 +16,8 @@
#include "tsearch/ts_cache.h"
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
+#include "utils/jsonb.h"
+#include "utils/fmgrprotos.h"
typedef struct MorphOpaque
@@ -256,6 +258,58 @@ to_tsvector(PG_FUNCTION_ARGS)
PointerGetDatum(in)));
}
+Datum
+jsonb_to_tsvector(PG_FUNCTION_ARGS)
+{
+ Jsonb *jb = PG_GETARG_JSONB(0);
+ JsonbIterator *it;
+ JsonbValue v;
+ Oid cfgId;
+ ParsedText prs;
+ TSVector result, item_vector;
+ JsonbIteratorToken type;
+ int i;
+
+ cfgId = getTSCurrentConfig(true);
+ it = JsonbIteratorInit(&jb->root);
+
+ while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+ {
+ if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString)
+ {
+ prs.lenwords = v.val.string.len / 6;
+
+ if (prs.lenwords == 0)
+ prs.lenwords = 2;
+
+ prs.curwords = 0;
+ prs.pos = 0;
+ prs.words = (ParsedWord *) palloc(sizeof(ParsedWord) * prs.lenwords);
+
+ parsetext(cfgId, &prs, v.val.string.val, v.val.string.len);
+
+ if (prs.curwords)
+ {
+ if (result->size != 0)
+ {
+ for (i = 0; i < prs.curwords; i++)
+ prs.words[i].pos.pos = prs.words[i].pos.pos + TS_JUMP;
+
+ item_vector = make_tsvector(&prs);
+
+ result = DirectFunctionCall2(tsvector_concat,
+ TSVectorGetDatum(result),
+ PointerGetDatum(item_vector));
+ }
+ else
+ result = make_tsvector(&prs);
+ }
+ }
+ }
+
+ PG_RETURN_DATUM(result);
+}
+
/*
* to_tsquery
*/
diff --git a/src/backend/tsearch/wparser.c b/src/backend/tsearch/wparser.c
index 8ca1c62..035632e 100644
--- a/src/backend/tsearch/wparser.c
+++ b/src/backend/tsearch/wparser.c
@@ -21,6 +21,7 @@
#include "tsearch/ts_utils.h"
#include "utils/builtins.h"
#include "utils/varlena.h"
+#include "utils/jsonb.h"
/******sql-level interface******/
@@ -362,3 +363,41 @@ ts_headline_opt(PG_FUNCTION_ARGS)
PG_GETARG_DATUM(1),
PG_GETARG_DATUM(2)));
}
+
+Datum
+ts_headline_jsonb(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+ ObjectIdGetDatum(getTSCurrentConfig(true)),
+ CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(0))),
+ PG_GETARG_DATUM(1)));
+}
+
+Datum
+ts_headline_jsonb_byid(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_DATUM(DirectFunctionCall3(ts_headline_byid_opt,
+ PG_GETARG_DATUM(0),
+ CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(1))),
+ PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_opt(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
+ ObjectIdGetDatum(getTSCurrentConfig(true)),
+ CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(0))),
+ PG_GETARG_DATUM(1),
+ PG_GETARG_DATUM(2)));
+}
+
+Datum
+ts_headline_jsonb_byid_opt(PG_FUNCTION_ARGS)
+{
+ PG_RETURN_DATUM(DirectFunctionCall4(ts_headline_byid_opt,
+ PG_GETARG_DATUM(0),
+ CStringGetTextDatum(jsonb_values_as_string(PG_GETARG_DATUM(1))),
+ PG_GETARG_DATUM(2),
+ PG_GETARG_DATUM(3)));
+}
diff --git a/src/backend/utils/adt/jsonfuncs.c b/src/backend/utils/adt/jsonfuncs.c
index 6a7aab2..d504b87 100644
--- a/src/backend/utils/adt/jsonfuncs.c
+++ b/src/backend/utils/adt/jsonfuncs.c
@@ -4130,3 +4130,29 @@ setPathArray(JsonbIterator **it, Datum *path_elems, bool *path_nulls,
}
}
}
+
+/*
+ * Gather all string values and elements from jsonb into one string buffer.
+ * It's convenient for using inside ts_headline_* functions.
+ */
+char*
+jsonb_values_as_string(Jsonb *jb)
+{
+ JsonbIterator *it;
+ JsonbValue v;
+ JsonbIteratorToken type;
+ StringInfo buffer = makeStringInfo();
+
+ it = JsonbIteratorInit(&jb->root);
+
+ while ((type = JsonbIteratorNext(&it, &v, false)) != WJB_DONE)
+ {
+ if ((type == WJB_VALUE || type == WJB_ELEM) && v.type == jbvString)
+ {
+ appendStringInfoString(buffer, v.val.string.val);
+ appendBinaryStringInfo(buffer, " ", 1);
+ }
+ }
+
+ return buffer->data;
+}
diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h
index a4cc86d..ccedece 100644
--- a/src/include/catalog/pg_proc.h
+++ b/src/include/catalog/pg_proc.h
@@ -4759,6 +4759,15 @@ DESCR("generate headline");
DATA(insert OID = 3755 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "25 3615" _null_ _null_ _null_ _null_ _null_ ts_headline _null_ _null_ _null_ ));
DESCR("generate headline");
+DATA(insert OID = 4201 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 4 0 25 "3734 3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid_opt _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4202 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f i s 3 0 25 "3734 3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_byid _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4203 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 3 0 25 "3802 3615 25" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb_opt _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+DATA(insert OID = 4204 ( ts_headline PGNSP PGUID 12 100 0 0 0 f f f f t f s s 2 0 25 "3802 3615" _null_ _null_ _null_ _null_ _null_ ts_headline_jsonb _null_ _null_ _null_ ));
+DESCR("generate headline from jsonb");
+
DATA(insert OID = 3745 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3614 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsvector_byid _null_ _null_ _null_ ));
DESCR("transform to tsvector");
DATA(insert OID = 3746 ( to_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f i s 2 0 3615 "3734 25" _null_ _null_ _null_ _null_ _null_ to_tsquery_byid _null_ _null_ _null_ ));
@@ -4775,6 +4784,8 @@ DATA(insert OID = 3751 ( plainto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s
DESCR("transform to tsquery");
DATA(insert OID = 5001 ( phraseto_tsquery PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3615 "25" _null_ _null_ _null_ _null_ _null_ phraseto_tsquery _null_ _null_ _null_ ));
DESCR("transform to tsquery");
+DATA(insert OID = 3800 ( to_tsvector PGNSP PGUID 12 100 0 0 0 f f f f t f s s 1 0 3614 "3802" _null_ _null_ _null_ _null_ _null_ jsonb_to_tsvector _null_ _null_ _null_ ));
+DESCR("transform jsonb to tsvector");
DATA(insert OID = 3752 ( tsvector_update_trigger PGNSP PGUID 12 1 0 0 0 f f f f f f v s 0 0 2279 "" _null_ _null_ _null_ _null_ _null_ tsvector_update_trigger_byid _null_ _null_ _null_ ));
DESCR("trigger for automatic update of tsvector column");
diff --git a/src/include/tsearch/ts_type.h b/src/include/tsearch/ts_type.h
index 155650c..873e2e1 100644
--- a/src/include/tsearch/ts_type.h
+++ b/src/include/tsearch/ts_type.h
@@ -86,6 +86,15 @@ typedef struct
#define MAXNUMPOS (256)
#define LIMITPOS(x) ( ( (x) >= MAXENTRYPOS ) ? (MAXENTRYPOS-1) : (x) )
+/*
+ * In case if a TSVector contains several parts and we want to treat them as
+ * separate, it's necessary to add an artificial increment to position of each
+ * lexeme from every next part. It's required to avoid the situation when
+ * tsquery can find a phrase consisting of lexemes from two of such parts.
+ * TS_JUMP defined a value of this increment.
+ */
+#define TS_JUMP 1
+
/* This struct represents a complete tsvector datum */
typedef struct
{
diff --git a/src/include/utils/jsonb.h b/src/include/utils/jsonb.h
index 411e158..233f7b7 100644
--- a/src/include/utils/jsonb.h
+++ b/src/include/utils/jsonb.h
@@ -377,5 +377,7 @@ extern char *JsonbToCString(StringInfo out, JsonbContainer *in,
extern char *JsonbToCStringIndent(StringInfo out, JsonbContainer *in,
int estimated_len);
+extern char *jsonb_values_as_string(Jsonb *jsonb);
+
#endif /* __JSONB_H__ */
diff --git a/src/test/regress/expected/jsonb.out b/src/test/regress/expected/jsonb.out
index 8ec4150..3333730 100644
--- a/src/test/regress/expected/jsonb.out
+++ b/src/test/regress/expected/jsonb.out
@@ -3474,3 +3474,86 @@ HINT: Try using the function jsonb_set to replace key value.
select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true);
ERROR: cannot replace existing key
HINT: Try using the function jsonb_set to replace key value.
+-- jsonb to tsvector
+select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb);
+ to_tsvector
+---------------------------------------------------------------------------
+ 'aaa':1 'bbb':2 'ccc':4 'ddd':3 'eee':6 'fff':7 'ggg':8 'hhh':10 'iii':11
+(1 row)
+
+-- jsonb to tsvector with stop words
+select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb);
+ to_tsvector
+----------------------------------------------------------------------------
+ 'aaa':1 'bbb':3 'ccc':5 'ddd':4 'eee':8 'fff':9 'ggg':10 'hhh':12 'iii':13
+(1 row)
+
+-- ts_vector corner cases
+select to_tsvector('""'::jsonb);
+ to_tsvector
+-------------
+
+(1 row)
+
+select to_tsvector('{}'::jsonb);
+ to_tsvector
+-------------
+
+(1 row)
+
+select to_tsvector('[]'::jsonb);
+ to_tsvector
+-------------
+
+(1 row)
+
+select to_tsvector('null'::jsonb);
+ to_tsvector
+-------------
+
+(1 row)
+
+-- ts_headline for jsonb
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+ ts_headline
+------------------------------------------------------------------------------
+ aaa <b>bbb</b> ccc <b>ddd</b> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj
+(1 row)
+
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+ ts_headline
+---------------------------------------------------------------
+ aaa <b>bbb</b> ccc <b>ddd</b> fff\x02 ggg hhhiii jjj iii jjj
+(1 row)
+
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+ ts_headline
+--------------------------------------------------------------------
+ aaa <bbb> ccc <ddd> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj
+(1 row)
+
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+ ts_headline
+--------------------------------------------------------------------
+ aaa <bbb> ccc <ddd> fffccc1 ddd1 ccc1 ddd1 ggg hhhiii jjj iii jjj
+(1 row)
+
+-- corner cases for ts_headline with jsonb
+select ts_headline('null'::jsonb, tsquery('aaa & bbb'));
+ ts_headline
+-------------
+
+(1 row)
+
+select ts_headline('{}'::jsonb, tsquery('aaa & bbb'));
+ ts_headline
+-------------
+
+(1 row)
+
+select ts_headline('[]'::jsonb, tsquery('aaa & bbb'));
+ ts_headline
+-------------
+
+(1 row)
+
diff --git a/src/test/regress/sql/jsonb.sql b/src/test/regress/sql/jsonb.sql
index e2eaca0..2e31ee6 100644
--- a/src/test/regress/sql/jsonb.sql
+++ b/src/test/regress/sql/jsonb.sql
@@ -878,3 +878,26 @@ select jsonb_insert('{"a": {"b": "value"}}', '{a, c}', '"new_value"', true);
select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"');
select jsonb_insert('{"a": {"b": "value"}}', '{a, b}', '"new_value"', true);
+
+-- jsonb to tsvector
+select to_tsvector('{"a": "aaa bbb ddd ccc", "b": ["eee fff ggg"], "c": {"d": "hhh iii"}}'::jsonb);
+
+-- jsonb to tsvector with stop words
+select to_tsvector('{"a": "aaa in bbb ddd ccc", "b": ["the eee fff ggg"], "c": {"d": "hhh. iii"}}'::jsonb);
+
+-- ts_vector corner cases
+select to_tsvector('""'::jsonb);
+select to_tsvector('{}'::jsonb);
+select to_tsvector('[]'::jsonb);
+select to_tsvector('null'::jsonb);
+
+-- ts_headline for jsonb
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'));
+select ts_headline('{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+select ts_headline('english', '{"a": "aaa bbb", "b": {"c": "ccc ddd fff", "c1": "ccc1 ddd1"}, "d": ["ggg hhh", "iii jjj"]}'::jsonb, tsquery('bbb & ddd & hhh'), 'StartSel = <, StopSel = >');
+
+-- corner cases for ts_headline with jsonb
+select ts_headline('null'::jsonb, tsquery('aaa & bbb'));
+select ts_headline('{}'::jsonb, tsquery('aaa & bbb'));
+select ts_headline('[]'::jsonb, tsquery('aaa & bbb'));
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers