I have updated the patch so that it emits parttoken and registers it with
the snowball config.
-Sushant.
On Fri, 2010-09-03 at 09:44 -0400, Robert Haas wrote:
> On Wed, Sep 1, 2010 at 2:42 AM, Sushant Sinha <[email protected]> wrote:
> > I have attached a patch that emits parts of a host token, a url token,
> > an email token and a file token. Further, it makes sure that a
> > host/url/email/file token and the first part-token are at the same
> > position in tsvector.
>
> You should probably add this patch here:
>
> https://commitfest.postgresql.org/action/commitfest_view/open
>
Index: src/backend/snowball/snowball.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/snowball/snowball.sql.in,v
retrieving revision 1.6
diff -u -r1.6 snowball.sql.in
--- src/backend/snowball/snowball.sql.in 27 Oct 2007 16:01:08 -0000 1.6
+++ src/backend/snowball/snowball.sql.in 4 Sep 2010 02:59:10 -0000
@@ -22,6 +22,6 @@
WITH _ASCDICTNAME_;
ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
- FOR word, hword_part, hword
+ FOR word, hword_part, hword, parttoken
WITH _NONASCDICTNAME_;
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.17
diff -u -r1.17 ts_parse.c
--- src/backend/tsearch/ts_parse.c 26 Feb 2010 02:01:05 -0000 1.17
+++ src/backend/tsearch/ts_parse.c 4 Sep 2010 02:59:11 -0000
@@ -19,7 +19,7 @@
#include "tsearch/ts_utils.h"
#define IGNORE_LONGLEXEME 1
-
+#define COMPLEX_TOKEN(x) ( x == 4 || x == 5 || x == 6 || x == 18 || x == 17 || x == 18 || x == 19)
/*
* Lexize subsystem
*/
@@ -407,8 +407,6 @@
{
TSLexeme *ptr = norms;
- prs->pos++; /* set pos */
-
while (ptr->lexeme)
{
if (prs->curwords == prs->lenwords)
@@ -429,6 +427,10 @@
prs->curwords++;
}
pfree(norms);
+
+ if (!COMPLEX_TOKEN(type))
+ prs->pos++; /* set pos */
+
}
} while (type > 0);
Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.33
diff -u -r1.33 wparser_def.c
--- src/backend/tsearch/wparser_def.c 19 Aug 2010 05:57:34 -0000 1.33
+++ src/backend/tsearch/wparser_def.c 4 Sep 2010 02:59:12 -0000
@@ -23,7 +23,7 @@
/* Define me to enable tracing of parser behavior */
-/* #define WPARSER_TRACE */
+//#define WPARSER_TRACE
/* Output token categories */
@@ -51,8 +51,9 @@
#define SIGNEDINT 21
#define UNSIGNEDINT 22
#define XMLENTITY 23
+#define PARTTOKEN 24
-#define LASTNUM 23
+#define LASTNUM 24
static const char *const tok_alias[] = {
"",
@@ -78,7 +79,8 @@
"float",
"int",
"uint",
- "entity"
+ "entity",
+ "parttoken"
};
static const char *const lex_descr[] = {
@@ -105,7 +107,8 @@
"Decimal notation",
"Signed integer",
"Unsigned integer",
- "XML entity"
+ "XML entity",
+ "Part of file/url/host/email"
};
@@ -249,7 +252,8 @@
TParserPosition *state;
bool ignore;
bool wanthost;
-
+ int partstop;
+ TParserState afterpart;
/* silly char */
char c;
@@ -617,8 +621,41 @@
}
return 1;
}
+static int
+p_ispartbingo(TParser *prs)
+{
+ int ret = 0;
+ if (prs->partstop > 0)
+ {
+ ret = 1;
+ if (prs->partstop <= prs->state->posbyte)
+ {
+ prs->state->state = prs->afterpart;
+ prs->partstop = 0;
+ }
+ else
+ prs->state->state = TPS_Base;
+ }
+ return ret;
+}
+static int
+p_ispart(TParser *prs)
+{
+ if (prs->partstop > 0)
+ return 1;
+ else
+ return 0;
+}
+static int
+p_ispartEOF(TParser *prs)
+{
+ if (p_ispart(prs) && p_isEOF(prs))
+ return 1;
+ else
+ return 0;
+}
/* deliberately suppress unused-function complaints for the above */
void _make_compiler_happy(void);
void
@@ -688,6 +725,21 @@
}
static void
+SpecialPart(TParser *prs)
+{
+ prs->partstop = prs->state->posbyte;
+ prs->state->posbyte -= prs->state->lenbytetoken;
+ prs->state->poschar -= prs->state->lenchartoken;
+ prs->afterpart = TPS_Base;
+}
+static void
+SpecialUrlPart(TParser *prs)
+{
+ SpecialPart(prs);
+ prs->afterpart = TPS_InURLPathStart;
+}
+
+static void
SpecialVerVersion(TParser *prs)
{
prs->state->posbyte -= prs->state->lenbytetoken;
@@ -1057,6 +1109,7 @@
{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
{p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
+ {p_ispart, 0, A_NEXT, TPS_InSpace, 0, NULL},
{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
@@ -1065,9 +1118,11 @@
static const TParserStateActionItem actionTPS_InNumWord[] = {
+ {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
@@ -1076,8 +1131,10 @@
};
static const TParserStateActionItem actionTPS_InAsciiWord[] = {
+ {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
@@ -1094,6 +1151,7 @@
};
static const TParserStateActionItem actionTPS_InWord[] = {
+ {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
{p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
@@ -1103,15 +1161,17 @@
};
static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
+ {p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+ {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
+ {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+ {p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
- {p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
- {p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
- {p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
};
@@ -1418,7 +1478,7 @@
};
static const TParserStateActionItem actionTPS_InHostDomain[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart},
{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
@@ -1427,9 +1487,9 @@
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
- {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
+ {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialUrlPart},
{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart}
};
static const TParserStateActionItem actionTPS_InPortFirst[] = {
@@ -1439,11 +1499,11 @@
};
static const TParserStateActionItem actionTPS_InPort[] = {
- {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
+ {p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart},
{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
- {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
+ {p_isstophost, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialUrlPart},
{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
- {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
+ {NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart}
};
static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
@@ -1457,6 +1517,7 @@
{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
+ {p_ispartbingo, 0, A_BINGO | A_CLRALL, TPS_Null, PARTTOKEN, NULL},
{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
@@ -1466,7 +1527,7 @@
static const TParserStateActionItem actionTPS_InEmail[] = {
{p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
- {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
+ {p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, SpecialPart},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
@@ -1507,22 +1568,22 @@
};
static const TParserStateActionItem actionTPS_InPathSecond[] = {
- {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+ {p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart},
{p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
- {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
- {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+ {p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart},
+ {p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart},
{NULL, 0, A_POP, TPS_Null, 0, NULL}
};
static const TParserStateActionItem actionTPS_InFile[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
+ {p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, SpecialPart},
{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
+ {NULL, 0, A_BINGO, TPS_Base, FILEPATH, SpecialPart}
};
static const TParserStateActionItem actionTPS_InFileNext[] = {
@@ -1544,9 +1605,9 @@
};
static const TParserStateActionItem actionTPS_InURLPath[] = {
- {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
+ {p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, SpecialPart},
{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
- {NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
+ {NULL, 0, A_BINGO, TPS_Base, URLPATH, SpecialPart}
};
static const TParserStateActionItem actionTPS_InFURL[] = {
Index: src/test/regress/expected/tsdicts.out
===================================================================
RCS file: /projects/cvsroot/pgsql/src/test/regress/expected/tsdicts.out,v
retrieving revision 1.6
diff -u -r1.6 tsdicts.out
--- src/test/regress/expected/tsdicts.out 14 Aug 2009 14:53:20 -0000 1.6
+++ src/test/regress/expected/tsdicts.out 4 Sep 2010 02:59:13 -0000
@@ -236,9 +236,9 @@
word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
WITH ispell, english_stem;
SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
- to_tsvector
-----------------------------------------------------------------------------------------------------
- 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3
+ to_tsvector
+---------------------------------------------------------------------------------------------------
+ 'ball':6 'book':0,4 'booking':0,4 'foot':6,9 'football':6 'footballklubber':6 'klubber':6 'sky':2
(1 row)
SELECT to_tsquery('ispell_tst', 'footballklubber');
@@ -260,9 +260,9 @@
ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING
REPLACE ispell WITH hunspell;
SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
- to_tsvector
-----------------------------------------------------------------------------------------------------
- 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3
+ to_tsvector
+---------------------------------------------------------------------------------------------------
+ 'ball':6 'book':0,4 'booking':0,4 'foot':6,9 'football':6 'footballklubber':6 'klubber':6 'sky':2
(1 row)
SELECT to_tsquery('hunspell_tst', 'footballklubber');
@@ -285,21 +285,21 @@
asciiword, hword_asciipart, asciihword
WITH synonym, english_stem;
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
- to_tsvector
----------------------------------------------------
- 'call':4 'often':3 'pgsql':1,6,8,12 'pronounc':10
+ to_tsvector
+--------------------------------------------------
+ 'call':3 'often':2 'pgsql':0,5,7,11 'pronounc':9
(1 row)
SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
- to_tsvector
-----------------------------------------------------------
- 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
+ to_tsvector
+---------------------------------------------------------
+ 'common':1 'googl':6,9 'instead':7 'mistak':2 'write':5
(1 row)
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
- to_tsvector
-----------------------------------------------
- 'form':8 'index':1,3,10 'plural':7 'right':6
+ to_tsvector
+---------------------------------------------
+ 'form':7 'index':0,2,9 'plural':6 'right':5
(1 row)
SELECT to_tsquery('synonym_tst', 'Index & indices');
@@ -319,18 +319,18 @@
SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
to_tsvector
----------------------------------
- '1':1,5 '12':3 '123':4 'pgsql':2
+ '1':0,4 '12':2 '123':3 'pgsql':1
(1 row)
SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
- to_tsvector
--------------------------------------------------------------
- 'abbrev':10 'call':8 'new':4 'sn':1,9,11 'star':5 'usual':7
+ to_tsvector
+------------------------------------------------------------
+ 'abbrev':9 'call':7 'new':3 'sn':0,8,10 'star':4 'usual':6
(1 row)
SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
- to_tsvector
--------------------------------------------------------
- 'card':3,10 'invit':2,9 'like':6 'look':5 'order':1,8
+ to_tsvector
+------------------------------------------------------
+ 'card':2,9 'invit':1,8 'like':5 'look':4 'order':0,7
(1 row)
Index: src/test/regress/expected/tsearch.out
===================================================================
RCS file: /projects/cvsroot/pgsql/src/test/regress/expected/tsearch.out,v
retrieving revision 1.18
diff -u -r1.18 tsearch.out
--- src/test/regress/expected/tsearch.out 28 Apr 2010 02:04:16 -0000 1.18
+++ src/test/regress/expected/tsearch.out 4 Sep 2010 02:59:14 -0000
@@ -251,7 +251,8 @@
21 | int | Signed integer
22 | uint | Unsigned integer
23 | entity | XML entity
-(23 rows)
+ 24 | parttoken | Part of file/url/host/email
+(24 rows)
SELECT * FROM ts_parse('default', '345 [email protected] '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 [email protected] qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
@@ -263,35 +264,91 @@
1 | qwe
12 | @
19 | efd.r
+ 24 | efd
+ 12 | .
+ 24 | r
12 | '
14 | http://
6 | www.com
+ 24 | www
+ 12 | .
+ 24 | com
12 | /
14 | http://
5 | aew.werc.ewr/?ad=qwe&dw
6 | aew.werc.ewr
+ 24 | aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
18 | /?ad=qwe&dw
+ 12 | /?
+ 24 | ad
+ 12 | =
+ 24 | qwe
+ 12 | &
+ 24 | dw
12 |
5 | 1aew.werc.ewr/?ad=qwe&dw
6 | 1aew.werc.ewr
+ 24 | 1aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
18 | /?ad=qwe&dw
+ 12 | /?
+ 24 | ad
+ 12 | =
+ 24 | qwe
+ 12 | &
+ 24 | dw
12 |
6 | 2aew.werc.ewr
+ 24 | 2aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
12 |
14 | http://
5 | 3aew.werc.ewr/?ad=qwe&dw
6 | 3aew.werc.ewr
+ 24 | 3aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
18 | /?ad=qwe&dw
+ 12 | /?
+ 24 | ad
+ 12 | =
+ 24 | qwe
+ 12 | &
+ 24 | dw
12 |
14 | http://
6 | 4aew.werc.ewr
+ 24 | 4aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
12 |
14 | http://
5 | 5aew.werc.ewr:8100/?
6 | 5aew.werc.ewr:8100
+ 24 | 5aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
+ 12 | :
+ 24 | 8100
18 | /?
- 12 |
- 1 | ad
+ 12 | /?
+ 24 | ad
12 | =
1 | qwe
12 | &
@@ -299,11 +356,41 @@
12 |
5 | 6aew.werc.ewr:8100/?ad=qwe&dw
6 | 6aew.werc.ewr:8100
+ 24 | 6aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
+ 12 | :
+ 24 | 8100
18 | /?ad=qwe&dw
+ 12 | /?
+ 24 | ad
+ 12 | =
+ 24 | qwe
+ 12 | &
+ 24 | dw
12 |
5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
6 | 7aew.werc.ewr:8100
+ 24 | 7aew
+ 12 | .
+ 24 | werc
+ 12 | .
+ 24 | ewr
+ 12 | :
+ 24 | 8100
18 | /?ad=qwe&dw=%20%32
+ 12 | /?
+ 24 | ad
+ 12 | =
+ 24 | qwe
+ 12 | &
+ 24 | dw
+ 12 | =%
+ 24 | 20
+ 12 | %
+ 24 | 32
12 |
7 | +4.0e-10
12 |
@@ -320,6 +407,11 @@
20 | 5.005
12 |
4 | [email protected]
+ 24 | teodor
+ 12 | @
+ 24 | stack
+ 12 | .
+ 24 | net
12 |
16 | qwe-wer
11 | qwe
@@ -349,20 +441,51 @@
12 | +
|
19 | /usr/local/fff
+ 12 | /
+ 24 | usr
+ 12 | /
+ 24 | local
+ 12 | /
+ 24 | fff
12 |
19 | /awdf/dwqe/4325
+ 12 | /
+ 24 | awdf
+ 12 | /
+ 24 | dwqe
+ 12 | /
+ 24 | 4325
12 |
19 | rewt/ewr
+ 24 | rewt
+ 12 | /
+ 24 | ewr
12 |
1 | wefjn
12 |
19 | /wqe-324/ewr
+ 12 | /
+ 24 | wqe
+ 21 | -324
+ 12 | /
+ 24 | ewr
12 |
19 | gist.h
+ 24 | gist
+ 12 | .
+ 24 | h
12 |
19 | gist.h.c
+ 24 | gist
+ 12 | .
+ 24 | h
+ 12 | .
+ 24 | c
12 |
19 | gist.c
+ 24 | gist
+ 12 | .
+ 24 | c
12 | .
1 | readline
12 |
@@ -393,14 +516,14 @@
12 |
12 | <>
1 | qwerty
-(133 rows)
+(255 rows)
SELECT to_tsvector('english', '345 [email protected] '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 [email protected] qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
/usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
<i <b> wow < jqw <> qwerty');
- to_tsvector
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- '+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 '[email protected]':35 'wefjn':50 'wer':38 'wow':65 'www.com':4
+ to_tsvector
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ '+4.0e-10':53 '-324':84 '-4.2':98,100 '/?':34 '/?ad=qwe&dw':9,15,24,41 '/?ad=qwe&dw=%20%32':48 '/awdf/dwqe/4325':77 '/usr/local/fff':74 '/wqe-324/ewr':83 '1aew':12 '1aew.werc.ewr':12 '1aew.werc.ewr/?ad=qwe&dw':12 '20':51 '234':101 '234.435':57 '2aew':18 '2aew.werc.ewr':18 '32':52 '345':0 '3aew':21 '3aew.werc.ewr':21 '3aew.werc.ewr/?ad=qwe&dw':21 '4.2':94,95,96 '4325':79 '455':58 '4aew':27 '4aew.werc.ewr':27 '5.005':59 '5aew':30 '5aew.werc.ewr:8100':30 '5aew.werc.ewr:8100/?':30 '6aew':37 '6aew.werc.ewr:8100':37 '6aew.werc.ewr:8100/?ad=qwe&dw':37 '7aew':44 '7aew.werc.ewr:8100':44 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':44 '8100':33,40,47 'ad':9,15,24,34,41,48 'aew':6 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':6 'asdf':66 'awdf':77 'c':90,92 'com':5 'dw':11,17,26,36,43,50 'dwqe':78 'efd':2 'efd.r':2 'ewr':8,14,20,23,29,32,39,46,81,85 'ewr1':72 'ewri2':73 'fff':76 'gist':86,88,91 'gist.c':91 'gist.h':86 'gist.h.c':88 'h':87,89 'hjwer':71 'jf':68 'jqw':104 'local':75 'net':62 'qwe':1,10,16,25,35,42,49,54,55,64 'qwe-wer':63 'qwer':67 'qwerti':105 'qwqwe':56 'r':3 'readlin':93,97,99 'rewt':80 'rewt/ewr':80 'sdjk':69 'stack':61 'teodor':60 '[email protected]':60 'usr':74 'wefjn':82 'wer':65 'werc':7,13,19,22,28,31,38,45 'wow':103 'wqe':83 'www':4 'www.com':4
(1 row)
SELECT length(to_tsvector('english', '345 [email protected] '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 [email protected] qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
@@ -408,7 +531,7 @@
<i <b> wow < jqw <> qwerty'));
length
--------
- 53
+ 85
(1 row)
-- ts_debug
@@ -428,41 +551,83 @@
-- check parsing of URLs
SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
- alias | description | token | dictionaries | dictionary | lexemes
-----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
- protocol | Protocol head | http:// | {} | |
- url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx}
- host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk}
- url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx}
- tag | XML tag | </span> | {} | |
-(5 rows)
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------------------+----------------------------------------+----------------+--------------+------------------------------------------
+ protocol | Protocol head | http:// | {} | |
+ url | URL | www.harewoodsolutions.co.uk/press.aspx | {simple} | simple | {www.harewoodsolutions.co.uk/press.aspx}
+ host | Host | www.harewoodsolutions.co.uk | {simple} | simple | {www.harewoodsolutions.co.uk}
+ parttoken | Part of file/url/host/email | www | {english_stem} | english_stem | {www}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | harewoodsolutions | {english_stem} | english_stem | {harewoodsolut}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | co | {english_stem} | english_stem | {co}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | uk | {english_stem} | english_stem | {uk}
+ url_path | URL path | /press.aspx | {simple} | simple | {/press.aspx}
+ blank | Space symbols | / | {} | |
+ parttoken | Part of file/url/host/email | press | {english_stem} | english_stem | {press}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | aspx | {english_stem} | english_stem | {aspx}
+ tag | XML tag | </span> | {} | |
+(16 rows)
SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
- alias | description | token | dictionaries | dictionary | lexemes
-----------+---------------+----------------------------+--------------+------------+------------------------------
- protocol | Protocol head | http:// | {} | |
- url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw}
- host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr}
- url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw}
- tag | XML tag | <span> | {} | |
-(5 rows)
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------------------+----------------------------+----------------+--------------+------------------------------
+ protocol | Protocol head | http:// | {} | |
+ url | URL | aew.wer0c.ewr/id?ad=qwe&dw | {simple} | simple | {aew.wer0c.ewr/id?ad=qwe&dw}
+ host | Host | aew.wer0c.ewr | {simple} | simple | {aew.wer0c.ewr}
+ parttoken | Part of file/url/host/email | aew | {english_stem} | english_stem | {aew}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | wer | {english_stem} | english_stem | {wer}
+ parttoken | Part of file/url/host/email | 0c | {english_stem} | english_stem | {0c}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | ewr | {english_stem} | english_stem | {ewr}
+ url_path | URL path | /id?ad=qwe&dw | {simple} | simple | {/id?ad=qwe&dw}
+ blank | Space symbols | / | {} | |
+ parttoken | Part of file/url/host/email | id | {english_stem} | english_stem | {id}
+ blank | Space symbols | ? | {} | |
+ parttoken | Part of file/url/host/email | ad | {english_stem} | english_stem | {ad}
+ blank | Space symbols | = | {} | |
+ parttoken | Part of file/url/host/email | qwe | {english_stem} | english_stem | {qwe}
+ blank | Space symbols | & | {} | |
+ parttoken | Part of file/url/host/email | dw | {english_stem} | english_stem | {dw}
+ tag | XML tag | <span> | {} | |
+(19 rows)
SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
- alias | description | token | dictionaries | dictionary | lexemes
-----------+---------------+----------------------+--------------+------------+------------------------
- protocol | Protocol head | http:// | {} | |
- url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?}
- host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
- url_path | URL path | /? | {simple} | simple | {/?}
-(4 rows)
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------------------+----------------------+----------------+--------------+------------------------
+ protocol | Protocol head | http:// | {} | |
+ url | URL | 5aew.werc.ewr:8100/? | {simple} | simple | {5aew.werc.ewr:8100/?}
+ host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
+ parttoken | Part of file/url/host/email | 5aew | {english_stem} | english_stem | {5aew}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | werc | {english_stem} | english_stem | {werc}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | ewr | {english_stem} | english_stem | {ewr}
+ blank | Space symbols | : | {} | |
+ parttoken | Part of file/url/host/email | 8100 | {english_stem} | english_stem | {8100}
+ url_path | URL path | /? | {simple} | simple | {/?}
+ blank | Space symbols | /? | {} | |
+(12 rows)
SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
- alias | description | token | dictionaries | dictionary | lexemes
-----------+-------------+------------------------+--------------+------------+--------------------------
- url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx}
- host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
- url_path | URL path | /?xx | {simple} | simple | {/?xx}
-(3 rows)
+ alias | description | token | dictionaries | dictionary | lexemes
+-----------+-----------------------------+------------------------+----------------+--------------+--------------------------
+ url | URL | 5aew.werc.ewr:8100/?xx | {simple} | simple | {5aew.werc.ewr:8100/?xx}
+ host | Host | 5aew.werc.ewr:8100 | {simple} | simple | {5aew.werc.ewr:8100}
+ parttoken | Part of file/url/host/email | 5aew | {english_stem} | english_stem | {5aew}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | werc | {english_stem} | english_stem | {werc}
+ blank | Space symbols | . | {} | |
+ parttoken | Part of file/url/host/email | ewr | {english_stem} | english_stem | {ewr}
+ blank | Space symbols | : | {} | |
+ parttoken | Part of file/url/host/email | 8100 | {english_stem} | english_stem | {8100}
+ url_path | URL path | /?xx | {simple} | simple | {/?xx}
+ blank | Space symbols | /? | {} | |
+ parttoken | Part of file/url/host/email | xx | {english_stem} | english_stem | {xx}
+(12 rows)
-- to_tsquery
SELECT to_tsquery('english', 'qwe & sKies ');
@@ -1002,7 +1167,7 @@
SELECT to_tsvector('SKIES My booKs');
to_tsvector
----------------------------
- 'books':3 'my':2 'skies':1
+ 'books':2 'my':1 'skies':0
(1 row)
SELECT plainto_tsquery('SKIES My booKs');
@@ -1021,7 +1186,7 @@
SELECT to_tsvector('SKIES My booKs');
to_tsvector
------------------
- 'book':3 'sky':1
+ 'book':2 'sky':0
(1 row)
SELECT plainto_tsquery('SKIES My booKs');
@@ -1075,20 +1240,20 @@
select * from pendtest where 'ipsu:*'::tsquery @@ ts;
ts
--------------------
- 'ipsum':2 'lore':1
+ 'ipsum':1 'lore':0
(1 row)
select * from pendtest where 'ipsa:*'::tsquery @@ ts;
ts
--------------------
- 'ipsam':2 'lore':1
+ 'ipsam':1 'lore':0
(1 row)
select * from pendtest where 'ips:*'::tsquery @@ ts;
ts
--------------------
- 'ipsam':2 'lore':1
- 'ipsum':2 'lore':1
+ 'ipsam':1 'lore':0
+ 'ipsum':1 'lore':0
(2 rows)
select * from pendtest where 'ipt:*'::tsquery @@ ts;
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers