I have updated the patch so that it emits parttoken and registers it with the snowball config.
-Sushant. On Fri, 2010-09-03 at 09:44 -0400, Robert Haas wrote: > On Wed, Sep 1, 2010 at 2:42 AM, Sushant Sinha <sushant...@gmail.com> wrote: > > I have attached a patch that emits parts of a host token, a url token, > > an email token and a file token. Further, it makes sure that a > > host/url/email/file token and the first part-token are at the same > > position in tsvector. > > You should probably add this patch here: > > https://commitfest.postgresql.org/action/commitfest_view/open >
Index: src/backend/snowball/snowball.sql.in =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/snowball/snowball.sql.in,v retrieving revision 1.6 diff -u -r1.6 snowball.sql.in --- src/backend/snowball/snowball.sql.in 27 Oct 2007 16:01:08 -0000 1.6 +++ src/backend/snowball/snowball.sql.in 4 Sep 2010 02:59:10 -0000 @@ -22,6 +22,6 @@ WITH _ASCDICTNAME_; ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING - FOR word, hword_part, hword + FOR word, hword_part, hword, parttoken WITH _NONASCDICTNAME_; Index: src/backend/tsearch/ts_parse.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/tsearch/ts_parse.c,v retrieving revision 1.17 diff -u -r1.17 ts_parse.c --- src/backend/tsearch/ts_parse.c 26 Feb 2010 02:01:05 -0000 1.17 +++ src/backend/tsearch/ts_parse.c 4 Sep 2010 02:59:11 -0000 @@ -19,7 +19,7 @@ #include "tsearch/ts_utils.h" #define IGNORE_LONGLEXEME 1 - +#define COMPLEX_TOKEN(x) ( x == 4 || x == 5 || x == 6 || x == 18 || x == 17 || x == 18 || x == 19) /* * Lexize subsystem */ @@ -407,8 +407,6 @@ { TSLexeme *ptr = norms; - prs->pos++; /* set pos */ - while (ptr->lexeme) { if (prs->curwords == prs->lenwords) @@ -429,6 +427,10 @@ prs->curwords++; } pfree(norms); + + if (!COMPLEX_TOKEN(type)) + prs->pos++; /* set pos */ + } } while (type > 0); Index: src/backend/tsearch/wparser_def.c =================================================================== RCS file: /projects/cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v retrieving revision 1.33 diff -u -r1.33 wparser_def.c --- src/backend/tsearch/wparser_def.c 19 Aug 2010 05:57:34 -0000 1.33 +++ src/backend/tsearch/wparser_def.c 4 Sep 2010 02:59:12 -0000 @@ -23,7 +23,7 @@ /* Define me to enable tracing of parser behavior */ -/* #define WPARSER_TRACE */ +//#define WPARSER_TRACE /* Output token categories */ @@ -51,8 +51,9 @@ #define SIGNEDINT 21 #define UNSIGNEDINT 22 #define XMLENTITY 23 
+#define PARTTOKEN 24 -#define LASTNUM 23 +#define LASTNUM 24 static const char *const tok_alias[] = { "", @@ -78,7 +79,8 @@ "float", "int", "uint", - "entity" + "entity", + "parttoken" }; static const char *const lex_descr[] = { @@ -105,7 +107,8 @@ "Decimal notation", "Signed integer", "Unsigned integer", - "XML entity" + "XML entity", + "Part of file/url/host/email" }; @@ -249,7 +252,8 @@ TParserPosition *state; bool ignore; bool wanthost; - + int partstop; + TParserState afterpart; /* silly char */ char c; @@ -617,8 +621,41 @@ } return 1; } +static int +p_ispartbingo(TParser *prs) +{ + int ret = 0; + if (prs->partstop > 0) + { + ret = 1; + if (prs->partstop <= prs->state->posbyte) + { + prs->state->state = prs->afterpart; + prs->partstop = 0; + } + else + prs->state->state = TPS_Base; + } + return ret; +} +static int +p_ispart(TParser *prs) +{ + if (prs->partstop > 0) + return 1; + else + return 0; +} +static int +p_ispartEOF(TParser *prs) +{ + if (p_ispart(prs) && p_isEOF(prs)) + return 1; + else + return 0; +} /* deliberately suppress unused-function complaints for the above */ void _make_compiler_happy(void); void @@ -688,6 +725,21 @@ } static void +SpecialPart(TParser *prs) +{ + prs->partstop = prs->state->posbyte; + prs->state->posbyte -= prs->state->lenbytetoken; + prs->state->poschar -= prs->state->lenchartoken; + prs->afterpart = TPS_Base; +} +static void +SpecialUrlPart(TParser *prs) +{ + SpecialPart(prs); + prs->afterpart = TPS_InURLPathStart; +} + +static void SpecialVerVersion(TParser *prs) { prs->state->posbyte -= prs->state->lenbytetoken; @@ -1057,6 +1109,7 @@ {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL}, + {p_ispart, 0, A_NEXT, TPS_InSpace, 0, NULL}, {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL}, @@ -1065,9 +1118,11 @@ 
static const TParserStateActionItem actionTPS_InNumWord[] = { + {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL}, {p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL}, {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL}, + {p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, @@ -1076,8 +1131,10 @@ }; static const TParserStateActionItem actionTPS_InAsciiWord[] = { + {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL}, {p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL}, + {p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL}, @@ -1094,6 +1151,7 @@ }; static const TParserStateActionItem actionTPS_InWord[] = { + {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL}, {p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL}, {p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL}, @@ -1103,15 +1161,17 @@ }; static const TParserStateActionItem actionTPS_InUnsignedInt[] = { + {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}, {p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL}, + {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL}, + {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL}, + {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL}, + {p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL}, {p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL}, {p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL}, - {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL}, - {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL}, - {p_isspecial, 0, A_NEXT, 
TPS_InNumWord, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL} }; @@ -1418,7 +1478,7 @@ }; static const TParserStateActionItem actionTPS_InHostDomain[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart}, {p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL}, {p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL}, {p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL}, @@ -1427,9 +1487,9 @@ {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_isdigit, 0, A_POP, TPS_Null, 0, NULL}, - {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL}, + {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialUrlPart}, {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL} + {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart} }; static const TParserStateActionItem actionTPS_InPortFirst[] = { @@ -1439,11 +1499,11 @@ }; static const TParserStateActionItem actionTPS_InPort[] = { - {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}, + {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart}, {p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL}, - {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL}, + {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialUrlPart}, {p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL}, - {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL} + {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart} }; static const TParserStateActionItem actionTPS_InHostFirstAN[] = { @@ -1457,6 +1517,7 @@ {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL}, {p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL}, + {p_ispartbingo, 0, A_BINGO | A_CLRALL, TPS_Null, PARTTOKEN, NULL}, {p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL}, {p_iseqC, '-', 
A_PUSH, TPS_InHostFirstAN, 0, NULL}, @@ -1466,7 +1527,7 @@ static const TParserStateActionItem actionTPS_InEmail[] = { {p_isstophost, 0, A_POP, TPS_Null, 0, NULL}, - {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL}, + {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, SpecialPart}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -1507,22 +1568,22 @@ }; static const TParserStateActionItem actionTPS_InPathSecond[] = { - {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL}, + {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart}, {p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL}, - {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL}, - {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL}, + {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart}, + {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; static const TParserStateActionItem actionTPS_InFile[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL}, + {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, SpecialPart}, {p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL}, {p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, - {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL} + {NULL, 0, A_BINGO, TPS_Base, FILEPATH, SpecialPart} }; static const TParserStateActionItem actionTPS_InFileNext[] = { @@ -1544,9 +1605,9 @@ }; static const TParserStateActionItem actionTPS_InURLPath[] = { - {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL}, + {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, SpecialPart}, {p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL}, - {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL} + {NULL, 0, A_BINGO, TPS_Base, URLPATH, SpecialPart} }; static const TParserStateActionItem actionTPS_InFURL[] = { Index: src/test/regress/expected/tsdicts.out 
=================================================================== RCS file: /projects/cvsroot/pgsql/src/test/regress/expected/tsdicts.out,v retrieving revision 1.6 diff -u -r1.6 tsdicts.out --- src/test/regress/expected/tsdicts.out 14 Aug 2009 14:53:20 -0000 1.6 +++ src/test/regress/expected/tsdicts.out 4 Sep 2010 02:59:13 -0000 @@ -236,9 +236,9 @@ word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart WITH ispell, english_stem; SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); - to_tsvector ----------------------------------------------------------------------------------------------------- - 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3 + to_tsvector +--------------------------------------------------------------------------------------------------- + 'ball':6 'book':0,4 'booking':0,4 'foot':6,9 'football':6 'footballklubber':6 'klubber':6 'sky':2 (1 row) SELECT to_tsquery('ispell_tst', 'footballklubber'); @@ -260,9 +260,9 @@ ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING REPLACE ispell WITH hunspell; SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot'); - to_tsvector ----------------------------------------------------------------------------------------------------- - 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3 + to_tsvector +--------------------------------------------------------------------------------------------------- + 'ball':6 'book':0,4 'booking':0,4 'foot':6,9 'football':6 'footballklubber':6 'klubber':6 'sky':2 (1 row) SELECT to_tsquery('hunspell_tst', 'footballklubber'); @@ -285,21 +285,21 @@ asciiword, hword_asciipart, asciihword WITH synonym, english_stem; SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); - to_tsvector 
---------------------------------------------------- - 'call':4 'often':3 'pgsql':1,6,8,12 'pronounc':10 + to_tsvector +-------------------------------------------------- + 'call':3 'often':2 'pgsql':0,5,7,11 'pronounc':9 (1 row) SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google'); - to_tsvector ----------------------------------------------------------- - 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6 + to_tsvector +--------------------------------------------------------- + 'common':1 'googl':6,9 'instead':7 'mistak':2 'write':5 (1 row) SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?'); - to_tsvector ----------------------------------------------- - 'form':8 'index':1,3,10 'plural':7 'right':6 + to_tsvector +--------------------------------------------- + 'form':7 'index':0,2,9 'plural':6 'right':5 (1 row) SELECT to_tsquery('synonym_tst', 'Index & indices'); @@ -319,18 +319,18 @@ SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one'); to_tsvector ---------------------------------- - '1':1,5 '12':3 '123':4 'pgsql':2 + '1':0,4 '12':2 '123':3 'pgsql':1 (1 row) SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)'); - to_tsvector -------------------------------------------------------------- - 'abbrev':10 'call':8 'new':4 'sn':1,9,11 'star':5 'usual':7 + to_tsvector +------------------------------------------------------------ + 'abbrev':9 'call':7 'new':3 'sn':0,8,10 'star':4 'usual':6 (1 row) SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets'); - to_tsvector -------------------------------------------------------- - 'card':3,10 'invit':2,9 'like':6 'look':5 'order':1,8 + to_tsvector +------------------------------------------------------ + 'card':2,9 'invit':1,8 'like':5 'look':4 'order':0,7 (1 row) Index: src/test/regress/expected/tsearch.out 
=================================================================== RCS file: /projects/cvsroot/pgsql/src/test/regress/expected/tsearch.out,v retrieving revision 1.18 diff -u -r1.18 tsearch.out --- src/test/regress/expected/tsearch.out 28 Apr 2010 02:04:16 -0000 1.18 +++ src/test/regress/expected/tsearch.out 4 Sep 2010 02:59:14 -0000 @@ -251,7 +251,8 @@ 21 | int | Signed integer 22 | uint | Unsigned integer 23 | entity | XML entity -(23 rows) + 24 | parttoken | Part of file/url/host/email +(24 rows) SELECT * FROM ts_parse('default', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 @@ -263,35 +264,91 @@ 1 | qwe 12 | @ 19 | efd.r + 24 | efd + 12 | . + 24 | r 12 | ' 14 | http:// 6 | www.com + 24 | www + 12 | . + 24 | com 12 | / 14 | http:// 5 | aew.werc.ewr/?ad=qwe&dw 6 | aew.werc.ewr + 24 | aew + 12 | . + 24 | werc + 12 | . + 24 | ewr 18 | /?ad=qwe&dw + 12 | /? + 24 | ad + 12 | = + 24 | qwe + 12 | & + 24 | dw 12 | 5 | 1aew.werc.ewr/?ad=qwe&dw 6 | 1aew.werc.ewr + 24 | 1aew + 12 | . + 24 | werc + 12 | . + 24 | ewr 18 | /?ad=qwe&dw + 12 | /? + 24 | ad + 12 | = + 24 | qwe + 12 | & + 24 | dw 12 | 6 | 2aew.werc.ewr + 24 | 2aew + 12 | . + 24 | werc + 12 | . + 24 | ewr 12 | 14 | http:// 5 | 3aew.werc.ewr/?ad=qwe&dw 6 | 3aew.werc.ewr + 24 | 3aew + 12 | . + 24 | werc + 12 | . + 24 | ewr 18 | /?ad=qwe&dw + 12 | /? + 24 | ad + 12 | = + 24 | qwe + 12 | & + 24 | dw 12 | 14 | http:// 6 | 4aew.werc.ewr + 24 | 4aew + 12 | . + 24 | werc + 12 | . + 24 | ewr 12 | 14 | http:// 5 | 5aew.werc.ewr:8100/? 
6 | 5aew.werc.ewr:8100 + 24 | 5aew + 12 | . + 24 | werc + 12 | . + 24 | ewr + 12 | : + 24 | 8100 18 | /? - 12 | - 1 | ad + 12 | /? + 24 | ad 12 | = 1 | qwe 12 | & @@ -299,11 +356,41 @@ 12 | 5 | 6aew.werc.ewr:8100/?ad=qwe&dw 6 | 6aew.werc.ewr:8100 + 24 | 6aew + 12 | . + 24 | werc + 12 | . + 24 | ewr + 12 | : + 24 | 8100 18 | /?ad=qwe&dw + 12 | /? + 24 | ad + 12 | = + 24 | qwe + 12 | & + 24 | dw 12 | 5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 6 | 7aew.werc.ewr:8100 + 24 | 7aew + 12 | . + 24 | werc + 12 | . + 24 | ewr + 12 | : + 24 | 8100 18 | /?ad=qwe&dw=%20%32 + 12 | /? + 24 | ad + 12 | = + 24 | qwe + 12 | & + 24 | dw + 12 | =% + 24 | 20 + 12 | % + 24 | 32 12 | 7 | +4.0e-10 12 | @@ -320,6 +407,11 @@ 20 | 5.005 12 | 4 | teo...@stack.net + 24 | teodor + 12 | @ + 24 | stack + 12 | . + 24 | net 12 | 16 | qwe-wer 11 | qwe @@ -349,20 +441,51 @@ 12 | + | 19 | /usr/local/fff + 12 | / + 24 | usr + 12 | / + 24 | local + 12 | / + 24 | fff 12 | 19 | /awdf/dwqe/4325 + 12 | / + 24 | awdf + 12 | / + 24 | dwqe + 12 | / + 24 | 4325 12 | 19 | rewt/ewr + 24 | rewt + 12 | / + 24 | ewr 12 | 1 | wefjn 12 | 19 | /wqe-324/ewr + 12 | / + 24 | wqe + 21 | -324 + 12 | / + 24 | ewr 12 | 19 | gist.h + 24 | gist + 12 | . + 24 | h 12 | 19 | gist.h.c + 24 | gist + 12 | . + 24 | h + 12 | . + 24 | c 12 | 19 | gist.c + 24 | gist + 12 | . + 24 | c 12 | . 1 | readline 12 | @@ -393,14 +516,14 @@ 12 | 12 | <> 1 | qwerty -(133 rows) +(255 rows) SELECT to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 
234 <i <b> wow < jqw <> qwerty'); - to_tsvector ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - '+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teo...@stack.net':35 'wefjn':50 'wer':38 'wow':65 'www.com':4 + to_tsvector 
+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + '+4.0e-10':53 '-324':84 '-4.2':98,100 '/?':34 '/?ad=qwe&dw':9,15,24,41 '/?ad=qwe&dw=%20%32':48 '/awdf/dwqe/4325':77 '/usr/local/fff':74 '/wqe-324/ewr':83 '1aew':12 '1aew.werc.ewr':12 '1aew.werc.ewr/?ad=qwe&dw':12 '20':51 '234':101 '234.435':57 '2aew':18 '2aew.werc.ewr':18 '32':52 '345':0 '3aew':21 '3aew.werc.ewr':21 '3aew.werc.ewr/?ad=qwe&dw':21 '4.2':94,95,96 '4325':79 '455':58 '4aew':27 '4aew.werc.ewr':27 '5.005':59 '5aew':30 '5aew.werc.ewr:8100':30 '5aew.werc.ewr:8100/?':30 '6aew':37 '6aew.werc.ewr:8100':37 '6aew.werc.ewr:8100/?ad=qwe&dw':37 '7aew':44 '7aew.werc.ewr:8100':44 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':44 '8100':33,40,47 'ad':9,15,24,34,41,48 'aew':6 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':6 
'asdf':66 'awdf':77 'c':90,92 'com':5 'dw':11,17,26,36,43,50 'dwqe':78 'efd':2 'efd.r':2 'ewr':8,14,20,23,29,32,39,46,81,85 'ewr1':72 'ewri2':73 'fff':76 'gist':86,88,91 'gist.c':91 'gist.h':86 'gist.h.c':88 'h':87,89 'hjwer':71 'jf':68 'jqw':104 'local':75 'net':62 'qwe':1,10,16,25,35,42,49,54,55,64 'qwe-wer':63 'qwer':67 'qwerti':105 'qwqwe':56 'r':3 'readlin':93,97,99 'rewt':80 'rewt/ewr':80 'sdjk':69 'stack':61 'teodor':60 'teo...@stack.net':60 'usr':74 'wefjn':82 'wer':65 'werc':7,13,19,22,28,31,38,45 'wow':103 'wqe':83 'www':4 'www.com':4 (1 row) SELECT length(to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> @@ -408,7 +531,7 @@ <i <b> wow < jqw <> qwerty')); length -------- - 53 + 85 (1 row) -- ts_debug @@ -428,41 +551,83 @@ -- check parsing of URLs SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>'); - alias | description | token | dictionaries | dictionary | lexemes -----------+---------------+----------------------------------------+--------------+------------+------------------------------------------ - protocol | Protocol head | http:// | {} | | - url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx} - host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk} - url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx} - tag | XML tag | </span> | {} | | -(5 rows) + alias | description | token | dictionaries | dictionary | lexemes 
+-----------+-----------------------------+----------------------------------------+----------------+--------------+------------------------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx} + host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk} + parttoken | Part of file/url/host/email | www | {english_stem} | english_stem | {www} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | harewoodsolutions | {english_stem} | english_stem | {harewoodsolut} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | co | {english_stem} | english_stem | {co} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | uk | {english_stem} | english_stem | {uk} + url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx} + blank | Space symbols | / | {} | | + parttoken | Part of file/url/host/email | press | {english_stem} | english_stem | {press} + blank | Space symbols | . 
| {} | | + parttoken | Part of file/url/host/email | aspx | {english_stem} | english_stem | {aspx} + tag | XML tag | </span> | {} | | +(16 rows) SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>'); - alias | description | token | dictionaries | dictionary | lexemes -----------+---------------+----------------------------+--------------+------------+------------------------------ - protocol | Protocol head | http:// | {} | | - url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw} - host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr} - url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw} - tag | XML tag | <span> | {} | | -(5 rows) + alias | description | token | dictionaries | dictionary | lexemes +-----------+-----------------------------+----------------------------+----------------+--------------+------------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw} + host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr} + parttoken | Part of file/url/host/email | aew | {english_stem} | english_stem | {aew} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | wer | {english_stem} | english_stem | {wer} + parttoken | Part of file/url/host/email | 0c | {english_stem} | english_stem | {0c} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | ewr | {english_stem} | english_stem | {ewr} + url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw} + blank | Space symbols | / | {} | | + parttoken | Part of file/url/host/email | id | {english_stem} | english_stem | {id} + blank | Space symbols | ? 
| {} | | + parttoken | Part of file/url/host/email | ad | {english_stem} | english_stem | {ad} + blank | Space symbols | = | {} | | + parttoken | Part of file/url/host/email | qwe | {english_stem} | english_stem | {qwe} + blank | Space symbols | & | {} | | + parttoken | Part of file/url/host/email | dw | {english_stem} | english_stem | {dw} + tag | XML tag | <span> | {} | | +(19 rows) SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?'); - alias | description | token | dictionaries | dictionary | lexemes -----------+---------------+----------------------+--------------+------------+------------------------ - protocol | Protocol head | http:// | {} | | - url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?} - host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} - url_path | URL path | /? | {simple} | simple | {/?} -(4 rows) + alias | description | token | dictionaries | dictionary | lexemes +-----------+-----------------------------+----------------------+----------------+--------------+------------------------ + protocol | Protocol head | http:// | {} | | + url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} + parttoken | Part of file/url/host/email | 5aew | {english_stem} | english_stem | {5aew} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | werc | {english_stem} | english_stem | {werc} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | ewr | {english_stem} | english_stem | {ewr} + blank | Space symbols | : | {} | | + parttoken | Part of file/url/host/email | 8100 | {english_stem} | english_stem | {8100} + url_path | URL path | /? | {simple} | simple | {/?} + blank | Space symbols | /? 
| {} | | +(12 rows) SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx'); - alias | description | token | dictionaries | dictionary | lexemes -----------+-------------+------------------------+--------------+------------+-------------------------- - url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx} - host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} - url_path | URL path | /?xx | {simple} | simple | {/?xx} -(3 rows) + alias | description | token | dictionaries | dictionary | lexemes +-----------+-----------------------------+------------------------+----------------+--------------+-------------------------- + url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx} + host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100} + parttoken | Part of file/url/host/email | 5aew | {english_stem} | english_stem | {5aew} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | werc | {english_stem} | english_stem | {werc} + blank | Space symbols | . | {} | | + parttoken | Part of file/url/host/email | ewr | {english_stem} | english_stem | {ewr} + blank | Space symbols | : | {} | | + parttoken | Part of file/url/host/email | 8100 | {english_stem} | english_stem | {8100} + url_path | URL path | /?xx | {simple} | simple | {/?xx} + blank | Space symbols | /? 
| {} | | + parttoken | Part of file/url/host/email | xx | {english_stem} | english_stem | {xx} +(12 rows) -- to_tsquery SELECT to_tsquery('english', 'qwe & sKies '); @@ -1002,7 +1167,7 @@ SELECT to_tsvector('SKIES My booKs'); to_tsvector ---------------------------- - 'books':3 'my':2 'skies':1 + 'books':2 'my':1 'skies':0 (1 row) SELECT plainto_tsquery('SKIES My booKs'); @@ -1021,7 +1186,7 @@ SELECT to_tsvector('SKIES My booKs'); to_tsvector ------------------ - 'book':3 'sky':1 + 'book':2 'sky':0 (1 row) SELECT plainto_tsquery('SKIES My booKs'); @@ -1075,20 +1240,20 @@ select * from pendtest where 'ipsu:*'::tsquery @@ ts; ts -------------------- - 'ipsum':2 'lore':1 + 'ipsum':1 'lore':0 (1 row) select * from pendtest where 'ipsa:*'::tsquery @@ ts; ts -------------------- - 'ipsam':2 'lore':1 + 'ipsam':1 'lore':0 (1 row) select * from pendtest where 'ips:*'::tsquery @@ ts; ts -------------------- - 'ipsam':2 'lore':1 - 'ipsum':2 'lore':1 + 'ipsam':1 'lore':0 + 'ipsum':1 'lore':0 (2 rows) select * from pendtest where 'ipt:*'::tsquery @@ ts;
-- Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org) To make changes to your subscription: http://www.postgresql.org/mailpref/pgsql-hackers