For headline generation to work properly, the email/file/url/host tokens
need to become skip tokens (see the HLIDSKIP/XMLHLIDSKIP change in
wparser_def.c). I have updated the patch with that change.

-Sushant.

On Sat, 2010-09-04 at 13:25 +0530, Sushant Sinha wrote:
> Updating the patch to emit parttoken and to register it with the
> snowball config.
> 
> -Sushant.
> 
> On Fri, 2010-09-03 at 09:44 -0400, Robert Haas wrote:
> > On Wed, Sep 1, 2010 at 2:42 AM, Sushant Sinha <sushant...@gmail.com> wrote:
> > > I have attached a patch that emits parts of a host token, a url token,
> > > an email token and a file token. Further, it makes sure that a
> > > host/url/email/file token and the first part-token are at the same
> > > position in tsvector.
> > 
> > You should probably add this patch here:
> > 
> > https://commitfest.postgresql.org/action/commitfest_view/open
> > 
> 

Index: src/backend/snowball/snowball.sql.in
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/snowball/snowball.sql.in,v
retrieving revision 1.6
diff -u -r1.6 snowball.sql.in
--- src/backend/snowball/snowball.sql.in	27 Oct 2007 16:01:08 -0000	1.6
+++ src/backend/snowball/snowball.sql.in	7 Sep 2010 01:46:55 -0000
@@ -22,6 +22,6 @@
 	WITH _ASCDICTNAME_;
 
 ALTER TEXT SEARCH CONFIGURATION _CFGNAME_ ADD MAPPING
-    FOR word, hword_part, hword
+    FOR word, hword_part, hword, parttoken
 	WITH _NONASCDICTNAME_;
 
Index: src/backend/tsearch/ts_parse.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/tsearch/ts_parse.c,v
retrieving revision 1.17
diff -u -r1.17 ts_parse.c
--- src/backend/tsearch/ts_parse.c	26 Feb 2010 02:01:05 -0000	1.17
+++ src/backend/tsearch/ts_parse.c	7 Sep 2010 01:46:55 -0000
@@ -19,7 +19,7 @@
 #include "tsearch/ts_utils.h"
 
 #define IGNORE_LONGLEXEME	1
-
+#define COMPLEX_TOKEN(x) ((x) == 4 || (x) == 5 || (x) == 6 || (x) == 17 || (x) == 18 || (x) == 19)	/* token types that do not advance pos; NOTE(review): 17 looks like hword - confirm intended */
 /*
  * Lexize subsystem
  */
@@ -407,8 +407,6 @@
 		{
 			TSLexeme   *ptr = norms;
 
-			prs->pos++;			/* set pos */
-
 			while (ptr->lexeme)
 			{
 				if (prs->curwords == prs->lenwords)
@@ -429,6 +427,10 @@
 				prs->curwords++;
 			}
 			pfree(norms);
+
+			if (!COMPLEX_TOKEN(type)) 
+				prs->pos++;			/* set pos */
+
 		}
 	} while (type > 0);
 
Index: src/backend/tsearch/wparser_def.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/tsearch/wparser_def.c,v
retrieving revision 1.33
diff -u -r1.33 wparser_def.c
--- src/backend/tsearch/wparser_def.c	19 Aug 2010 05:57:34 -0000	1.33
+++ src/backend/tsearch/wparser_def.c	7 Sep 2010 01:46:56 -0000
@@ -23,7 +23,7 @@
 
 
 /* Define me to enable tracing of parser behavior */
-/* #define WPARSER_TRACE */
+/* #define WPARSER_TRACE */
 
 
 /* Output token categories */
@@ -51,8 +51,9 @@
 #define SIGNEDINT		21
 #define UNSIGNEDINT		22
 #define XMLENTITY		23
+#define PARTTOKEN		24
 
-#define LASTNUM			23
+#define LASTNUM			24
 
 static const char *const tok_alias[] = {
 	"",
@@ -78,7 +79,8 @@
 	"float",
 	"int",
 	"uint",
-	"entity"
+	"entity",
+	"parttoken"
 };
 
 static const char *const lex_descr[] = {
@@ -105,7 +107,8 @@
 	"Decimal notation",
 	"Signed integer",
 	"Unsigned integer",
-	"XML entity"
+	"XML entity",
+    "Part of file/url/host/email"
 };
 
 
@@ -249,7 +252,8 @@
 	TParserPosition *state;
 	bool		ignore;
 	bool		wanthost;
-
+	int 		partstop;
+	TParserState	afterpart;
 	/* silly char */
 	char		c;
 
@@ -617,8 +621,41 @@
 	}
 	return 1;
 }
+/*
+ * End-of-part test for a re-scan; as a side effect sets prs->state->state.
+ */
+static int
+p_ispartbingo(TParser *prs)
+{
+	if (prs->partstop <= 0)
+		return 0;				/* no re-scan in progress */
+	if (prs->partstop > prs->state->posbyte)
+		prs->state->state = TPS_Base;	/* stop offset not reached yet */
+	else
+	{
+		prs->state->state = prs->afterpart;
+		prs->partstop = 0;		/* stop offset reached: re-scan is over */
+	}
+	return 1;
+}
 
+/*
+ * Returns 1 while a part re-scan is armed (partstop is positive), else 0.
+ */
+static int
+p_ispart(TParser *prs)
+{
+	return (prs->partstop > 0) ? 1 : 0;
+}
 
+/*
+ * Returns 1 when end of input is hit while a part re-scan is armed, else 0.
+ */
+static int
+p_ispartEOF(TParser *prs)
+{
+	return (p_ispart(prs) && p_isEOF(prs)) ? 1 : 0;
+}
 /* deliberately suppress unused-function complaints for the above */
 void		_make_compiler_happy(void);
 void
@@ -688,6 +725,21 @@
 }
 
 static void
+SpecialPart(TParser *prs)
+{
+	prs->afterpart = TPS_Base;				/* state p_ispartbingo installs once the re-scan ends */
+	prs->partstop = prs->state->posbyte;	/* current byte offset: where the re-scan stops */
+	prs->state->poschar -= prs->state->lenchartoken;	/* step back over the token ... */
+	prs->state->posbyte -= prs->state->lenbytetoken;	/* ... so it is scanned again for parts */
+}
+static void
+SpecialUrlPart(TParser *prs)
+{
+	SpecialPart(prs);	/* record the stop offset and step back over the token */
+	prs->afterpart = TPS_InURLPathStart;	/* override: continue in the state the host rules used to enter directly */
+}
+
+static void
 SpecialVerVersion(TParser *prs)
 {
 	prs->state->posbyte -= prs->state->lenbytetoken;
@@ -1057,6 +1109,7 @@
 	{p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
 	{p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL},
 	{p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL},
+	{p_ispart, 0, A_NEXT, TPS_InSpace, 0, NULL},
 	{p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL},
 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL},
@@ -1065,9 +1118,11 @@
 
 
 static const TParserStateActionItem actionTPS_InNumWord[] = {
+	{p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_isEOF, 0, A_BINGO, TPS_Base, NUMWORD, NULL},
 	{p_isalnum, 0, A_NEXT, TPS_InNumWord, 0, NULL},
 	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+	{p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
@@ -1076,8 +1131,10 @@
 };
 
 static const TParserStateActionItem actionTPS_InAsciiWord[] = {
+	{p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_isEOF, 0, A_BINGO, TPS_Base, ASCIIWORD, NULL},
 	{p_isasclet, 0, A_NEXT, TPS_Null, 0, NULL},
+	{p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
@@ -1094,6 +1151,7 @@
 };
 
 static const TParserStateActionItem actionTPS_InWord[] = {
+	{p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_isEOF, 0, A_BINGO, TPS_Base, WORD_T, NULL},
 	{p_isalpha, 0, A_NEXT, TPS_Null, 0, NULL},
 	{p_isspecial, 0, A_NEXT, TPS_Null, 0, NULL},
@@ -1103,15 +1161,17 @@
 };
 
 static const TParserStateActionItem actionTPS_InUnsignedInt[] = {
+	{p_ispartEOF, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_isEOF, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL},
 	{p_isdigit, 0, A_NEXT, TPS_Null, 0, NULL},
+	{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
+	{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
+	{p_ispartbingo, 0, A_BINGO, TPS_Null, PARTTOKEN, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InUDecimalFirst, 0, NULL},
 	{p_iseqC, 'e', A_PUSH, TPS_InMantissaFirst, 0, NULL},
 	{p_iseqC, 'E', A_PUSH, TPS_InMantissaFirst, 0, NULL},
-	{p_isasclet, 0, A_PUSH, TPS_InHost, 0, NULL},
-	{p_isalpha, 0, A_NEXT, TPS_InNumWord, 0, NULL},
-	{p_isspecial, 0, A_NEXT, TPS_InNumWord, 0, NULL},
 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
 	{NULL, 0, A_BINGO, TPS_Base, UNSIGNEDINT, NULL}
 };
@@ -1418,7 +1478,7 @@
 };
 
 static const TParserStateActionItem actionTPS_InHostDomain[] = {
-	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart},
 	{p_isasclet, 0, A_NEXT, TPS_InHostDomain, 0, NULL},
 	{p_isdigit, 0, A_PUSH, TPS_InHost, 0, NULL},
 	{p_iseqC, ':', A_PUSH, TPS_InPortFirst, 0, NULL},
@@ -1427,9 +1487,9 @@
 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
 	{p_isdigit, 0, A_POP, TPS_Null, 0, NULL},
-	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
+	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialUrlPart},
 	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
-	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart}
 };
 
 static const TParserStateActionItem actionTPS_InPortFirst[] = {
@@ -1439,11 +1499,11 @@
 };
 
 static const TParserStateActionItem actionTPS_InPort[] = {
-	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL},
+	{p_isEOF, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart},
 	{p_isdigit, 0, A_NEXT, TPS_InPort, 0, NULL},
-	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_InURLPathStart, HOST, NULL},
+	{p_isstophost, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialUrlPart},
 	{p_iseqC, '/', A_PUSH, TPS_InFURL, 0, NULL},
-	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, NULL}
+	{NULL, 0, A_BINGO | A_CLRALL, TPS_Base, HOST, SpecialPart}
 };
 
 static const TParserStateActionItem actionTPS_InHostFirstAN[] = {
@@ -1457,6 +1517,7 @@
 	{p_isEOF, 0, A_POP, TPS_Null, 0, NULL},
 	{p_isdigit, 0, A_NEXT, TPS_InHost, 0, NULL},
 	{p_isasclet, 0, A_NEXT, TPS_InHost, 0, NULL},
+	{p_ispartbingo, 0, A_BINGO | A_CLRALL, TPS_Null, PARTTOKEN, NULL},
 	{p_iseqC, '@', A_PUSH, TPS_InEmail, 0, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InHostFirstDomain, 0, NULL},
 	{p_iseqC, '-', A_PUSH, TPS_InHostFirstAN, 0, NULL},
@@ -1466,7 +1527,7 @@
 
 static const TParserStateActionItem actionTPS_InEmail[] = {
 	{p_isstophost, 0, A_POP, TPS_Null, 0, NULL},
-	{p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, NULL},
+	{p_ishost, 0, A_BINGO | A_CLRALL, TPS_Base, EMAIL, SpecialPart},
 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
@@ -1507,22 +1568,22 @@
 };
 
 static const TParserStateActionItem actionTPS_InPathSecond[] = {
-	{p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+	{p_isEOF, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart},
 	{p_iseqC, '/', A_NEXT | A_PUSH, TPS_InFileFirst, 0, NULL},
-	{p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
-	{p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, NULL},
+	{p_iseqC, '/', A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart},
+	{p_isspace, 0, A_BINGO | A_CLEAR, TPS_Base, FILEPATH, SpecialPart},
 	{NULL, 0, A_POP, TPS_Null, 0, NULL}
 };
 
 static const TParserStateActionItem actionTPS_InFile[] = {
-	{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, NULL},
+	{p_isEOF, 0, A_BINGO, TPS_Base, FILEPATH, SpecialPart},
 	{p_isasclet, 0, A_NEXT, TPS_InFile, 0, NULL},
 	{p_isdigit, 0, A_NEXT, TPS_InFile, 0, NULL},
 	{p_iseqC, '.', A_PUSH, TPS_InFileNext, 0, NULL},
 	{p_iseqC, '_', A_NEXT, TPS_InFile, 0, NULL},
 	{p_iseqC, '-', A_NEXT, TPS_InFile, 0, NULL},
 	{p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL},
-	{NULL, 0, A_BINGO, TPS_Base, FILEPATH, NULL}
+	{NULL, 0, A_BINGO, TPS_Base, FILEPATH, SpecialPart}
 };
 
 static const TParserStateActionItem actionTPS_InFileNext[] = {
@@ -1544,9 +1605,9 @@
 };
 
 static const TParserStateActionItem actionTPS_InURLPath[] = {
-	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, NULL},
+	{p_isEOF, 0, A_BINGO, TPS_Base, URLPATH, SpecialPart},
 	{p_isurlchar, 0, A_NEXT, TPS_InURLPath, 0, NULL},
-	{NULL, 0, A_BINGO, TPS_Base, URLPATH, NULL}
+	{NULL, 0, A_BINGO, TPS_Base, URLPATH, SpecialPart}
 };
 
 static const TParserStateActionItem actionTPS_InFURL[] = {
@@ -2003,8 +2064,8 @@
 
 #define TS_IDIGNORE(x)  ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY )
 #define HLIDREPLACE(x)  ( (x)==TAG_T )
-#define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
-#define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD )
+#define HLIDSKIP(x)     ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD || (x) == EMAIL || (x) == FILEPATH || (x) == URLPATH || (x) == HOST)
+#define XMLHLIDSKIP(x)  ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD || (x) == EMAIL || (x) == FILEPATH || (x) == URLPATH || (x) == HOST)
 #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDREPLACE(x) || HLIDSKIP(x) )
 #define NOENDTOKEN(x)	( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) )
 
Index: src/test/regress/expected/tsdicts.out
===================================================================
RCS file: /projects/cvsroot/pgsql/src/test/regress/expected/tsdicts.out,v
retrieving revision 1.6
diff -u -r1.6 tsdicts.out
--- src/test/regress/expected/tsdicts.out	14 Aug 2009 14:53:20 -0000	1.6
+++ src/test/regress/expected/tsdicts.out	7 Sep 2010 01:47:00 -0000
@@ -236,9 +236,9 @@
 	word, numword, asciiword, hword, numhword, asciihword, hword_part, hword_numpart, hword_asciipart
 	WITH ispell, english_stem;
 SELECT to_tsvector('ispell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
-                                            to_tsvector                                             
-----------------------------------------------------------------------------------------------------
- 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3
+                                            to_tsvector                                            
+---------------------------------------------------------------------------------------------------
+ 'ball':6 'book':0,4 'booking':0,4 'foot':6,9 'football':6 'footballklubber':6 'klubber':6 'sky':2
 (1 row)
 
 SELECT to_tsquery('ispell_tst', 'footballklubber');
@@ -260,9 +260,9 @@
 ALTER TEXT SEARCH CONFIGURATION hunspell_tst ALTER MAPPING
 	REPLACE ispell WITH hunspell;
 SELECT to_tsvector('hunspell_tst', 'Booking the skies after rebookings for footballklubber from a foot');
-                                            to_tsvector                                             
-----------------------------------------------------------------------------------------------------
- 'ball':7 'book':1,5 'booking':1,5 'foot':7,10 'football':7 'footballklubber':7 'klubber':7 'sky':3
+                                            to_tsvector                                            
+---------------------------------------------------------------------------------------------------
+ 'ball':6 'book':0,4 'booking':0,4 'foot':6,9 'football':6 'footballklubber':6 'klubber':6 'sky':2
 (1 row)
 
 SELECT to_tsquery('hunspell_tst', 'footballklubber');
@@ -285,21 +285,21 @@
 	asciiword, hword_asciipart, asciihword 
 	WITH synonym, english_stem;
 SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
-                    to_tsvector                    
----------------------------------------------------
- 'call':4 'often':3 'pgsql':1,6,8,12 'pronounc':10
+                   to_tsvector                    
+--------------------------------------------------
+ 'call':3 'often':2 'pgsql':0,5,7,11 'pronounc':9
 (1 row)
 
 SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
-                       to_tsvector                        
-----------------------------------------------------------
- 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
+                       to_tsvector                       
+---------------------------------------------------------
+ 'common':1 'googl':6,9 'instead':7 'mistak':2 'write':5
 (1 row)
 
 SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
-                 to_tsvector                  
-----------------------------------------------
- 'form':8 'index':1,3,10 'plural':7 'right':6
+                 to_tsvector                 
+---------------------------------------------
+ 'form':7 'index':0,2,9 'plural':6 'right':5
 (1 row)
 
 SELECT to_tsquery('synonym_tst', 'Index & indices');
@@ -319,18 +319,18 @@
 SELECT to_tsvector('thesaurus_tst', 'one postgres one two one two three one');
            to_tsvector            
 ----------------------------------
- '1':1,5 '12':3 '123':4 'pgsql':2
+ '1':0,4 '12':2 '123':3 'pgsql':1
 (1 row)
 
 SELECT to_tsvector('thesaurus_tst', 'Supernovae star is very new star and usually called supernovae (abbrevation SN)');
-                         to_tsvector                         
--------------------------------------------------------------
- 'abbrev':10 'call':8 'new':4 'sn':1,9,11 'star':5 'usual':7
+                        to_tsvector                         
+------------------------------------------------------------
+ 'abbrev':9 'call':7 'new':3 'sn':0,8,10 'star':4 'usual':6
 (1 row)
 
 SELECT to_tsvector('thesaurus_tst', 'Booking tickets is looking like a booking a tickets');
-                      to_tsvector                      
--------------------------------------------------------
- 'card':3,10 'invit':2,9 'like':6 'look':5 'order':1,8
+                     to_tsvector                      
+------------------------------------------------------
+ 'card':2,9 'invit':1,8 'like':5 'look':4 'order':0,7
 (1 row)
 
Index: src/test/regress/expected/tsearch.out
===================================================================
RCS file: /projects/cvsroot/pgsql/src/test/regress/expected/tsearch.out,v
retrieving revision 1.18
diff -u -r1.18 tsearch.out
--- src/test/regress/expected/tsearch.out	28 Apr 2010 02:04:16 -0000	1.18
+++ src/test/regress/expected/tsearch.out	7 Sep 2010 01:47:01 -0000
@@ -251,7 +251,8 @@
     21 | int             | Signed integer
     22 | uint            | Unsigned integer
     23 | entity          | XML entity
-(23 rows)
+    24 | parttoken       | Part of file/url/host/email
+(24 rows)
 
 SELECT * FROM ts_parse('default', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
@@ -263,35 +264,91 @@
      1 | qwe
     12 | @
     19 | efd.r
+    24 | efd
+    12 | .
+    24 | r
     12 |  ' 
     14 | http://
      6 | www.com
+    24 | www
+    12 | .
+    24 | com
     12 | / 
     14 | http://
      5 | aew.werc.ewr/?ad=qwe&dw
      6 | aew.werc.ewr
+    24 | aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
     18 | /?ad=qwe&dw
+    12 | /?
+    24 | ad
+    12 | =
+    24 | qwe
+    12 | &
+    24 | dw
     12 |  
      5 | 1aew.werc.ewr/?ad=qwe&dw
      6 | 1aew.werc.ewr
+    24 | 1aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
     18 | /?ad=qwe&dw
+    12 | /?
+    24 | ad
+    12 | =
+    24 | qwe
+    12 | &
+    24 | dw
     12 |  
      6 | 2aew.werc.ewr
+    24 | 2aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
     12 |  
     14 | http://
      5 | 3aew.werc.ewr/?ad=qwe&dw
      6 | 3aew.werc.ewr
+    24 | 3aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
     18 | /?ad=qwe&dw
+    12 | /?
+    24 | ad
+    12 | =
+    24 | qwe
+    12 | &
+    24 | dw
     12 |  
     14 | http://
      6 | 4aew.werc.ewr
+    24 | 4aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
     12 |  
     14 | http://
      5 | 5aew.werc.ewr:8100/?
      6 | 5aew.werc.ewr:8100
+    24 | 5aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
+    12 | :
+    24 | 8100
     18 | /?
-    12 |   
-     1 | ad
+    12 | /?  
+    24 | ad
     12 | =
      1 | qwe
     12 | &
@@ -299,11 +356,41 @@
     12 |  
      5 | 6aew.werc.ewr:8100/?ad=qwe&dw
      6 | 6aew.werc.ewr:8100
+    24 | 6aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
+    12 | :
+    24 | 8100
     18 | /?ad=qwe&dw
+    12 | /?
+    24 | ad
+    12 | =
+    24 | qwe
+    12 | &
+    24 | dw
     12 |  
      5 | 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32
      6 | 7aew.werc.ewr:8100
+    24 | 7aew
+    12 | .
+    24 | werc
+    12 | .
+    24 | ewr
+    12 | :
+    24 | 8100
     18 | /?ad=qwe&dw=%20%32
+    12 | /?
+    24 | ad
+    12 | =
+    24 | qwe
+    12 | &
+    24 | dw
+    12 | =%
+    24 | 20
+    12 | %
+    24 | 32
     12 |  
      7 | +4.0e-10
     12 |  
@@ -320,6 +407,11 @@
     20 | 5.005
     12 |  
      4 | teo...@stack.net
+    24 | teodor
+    12 | @
+    24 | stack
+    12 | .
+    24 | net
     12 |  
     16 | qwe-wer
     11 | qwe
@@ -349,20 +441,51 @@
     12 |                                     +
        | 
     19 | /usr/local/fff
+    12 | /
+    24 | usr
+    12 | /
+    24 | local
+    12 | /
+    24 | fff
     12 |  
     19 | /awdf/dwqe/4325
+    12 | /
+    24 | awdf
+    12 | /
+    24 | dwqe
+    12 | /
+    24 | 4325
     12 |  
     19 | rewt/ewr
+    24 | rewt
+    12 | /
+    24 | ewr
     12 |  
      1 | wefjn
     12 |  
     19 | /wqe-324/ewr
+    12 | /
+    24 | wqe
+    21 | -324
+    12 | /
+    24 | ewr
     12 |  
     19 | gist.h
+    24 | gist
+    12 | .
+    24 | h
     12 |  
     19 | gist.h.c
+    24 | gist
+    12 | .
+    24 | h
+    12 | .
+    24 | c
     12 |  
     19 | gist.c
+    24 | gist
+    12 | .
+    24 | c
     12 | . 
      1 | readline
     12 |  
@@ -393,14 +516,14 @@
     12 |  
     12 | <> 
      1 | qwerty
-(133 rows)
+(255 rows)
 
 SELECT to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
 /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234
 <i <b> wow  < jqw <> qwerty');
-                                                                                                                                                                                                                                                                                                                                                                                                                                       to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                                        
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
- '+4.0e-10':28 '-4.2':60,62 '/?':18 '/?ad=qwe&dw':7,10,14,24 '/?ad=qwe&dw=%20%32':27 '/awdf/dwqe/4325':48 '/usr/local/fff':47 '/wqe-324/ewr':51 '1aew.werc.ewr':9 '1aew.werc.ewr/?ad=qwe&dw':8 '234':63 '234.435':32 '2aew.werc.ewr':11 '345':1 '3aew.werc.ewr':13 '3aew.werc.ewr/?ad=qwe&dw':12 '4.2':56,57,58 '455':33 '4aew.werc.ewr':15 '5.005':34 '5aew.werc.ewr:8100':17 '5aew.werc.ewr:8100/?':16 '6aew.werc.ewr:8100':23 '6aew.werc.ewr:8100/?ad=qwe&dw':22 '7aew.werc.ewr:8100':26 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':25 'ad':19 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':5 'asdf':39 'dw':21 'efd.r':3 'ewr1':45 'ewri2':46 'gist.c':54 'gist.h':52 'gist.h.c':53 'hjwer':44 'jf':41 'jqw':66 'qwe':2,20,29,30,37 'qwe-wer':36 'qwer':40 'qwerti':67 'qwqwe':31 'readlin':55,59,61 'rewt/ewr':49 'sdjk':42 'teo...@stack.net':35 'wefjn':50 'wer':38 'wow':65 'www.com':4
+                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     to_tsvector                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      
+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
+ '+4.0e-10':53 '-324':84 '-4.2':98,100 '/?':34 '/?ad=qwe&dw':9,15,24,41 '/?ad=qwe&dw=%20%32':48 '/awdf/dwqe/4325':77 '/usr/local/fff':74 '/wqe-324/ewr':83 '1aew':12 '1aew.werc.ewr':12 '1aew.werc.ewr/?ad=qwe&dw':12 '20':51 '234':101 '234.435':57 '2aew':18 '2aew.werc.ewr':18 '32':52 '345':0 '3aew':21 '3aew.werc.ewr':21 '3aew.werc.ewr/?ad=qwe&dw':21 '4.2':94,95,96 '4325':79 '455':58 '4aew':27 '4aew.werc.ewr':27 '5.005':59 '5aew':30 '5aew.werc.ewr:8100':30 '5aew.werc.ewr:8100/?':30 '6aew':37 '6aew.werc.ewr:8100':37 '6aew.werc.ewr:8100/?ad=qwe&dw':37 '7aew':44 '7aew.werc.ewr:8100':44 '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32':44 '8100':33,40,47 'ad':9,15,24,34,41,48 'aew':6 'aew.werc.ewr':6 'aew.werc.ewr/?ad=qwe&dw':6 'asdf':66 'awdf':77 'c':90,92 'com':5 'dw':11,17,26,36,43,50 'dwqe':78 'efd':2 'efd.r':2 'ewr':8,14,20,23,29,32,39,46,81,85 'ewr1':72 'ewri2':73 'fff':76 'gist':86,88,91 'gist.c':91 'gist.h':86 'gist.h.c':88 'h':87,89 'hjwer':71 'jf':68 'jqw':104 'local':75 'net':62 'qwe':1,10,16,25,35,42,49,54,55,64 'qwe-wer':63 'qwer':67 'qwerti':105 'qwqwe':56 'r':3 'readlin':93,97,99 'rewt':80 'rewt/ewr':80 'sdjk':69 'stack':61 'teodor':60 'teo...@stack.net':60 'usr':74 'wefjn':82 'wer':65 'werc':7,13,19,22,28,31,38,45 'wow':103 'wqe':83 'www':4 'www.com':4
 (1 row)
 
 SELECT length(to_tsvector('english', '345 q...@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/?  ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teo...@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">
@@ -408,7 +531,7 @@
 <i <b> wow  < jqw <> qwerty'));
  length 
 --------
-     53
+     85
 (1 row)
 
 -- ts_debug
@@ -428,41 +551,83 @@
 
 -- check parsing of URLs
 SELECT * from ts_debug('english', 'http://www.harewoodsolutions.co.uk/press.aspx</span>');
-  alias   |  description  |                 token                  | dictionaries | dictionary |                 lexemes                  
-----------+---------------+----------------------------------------+--------------+------------+------------------------------------------
- protocol | Protocol head | http://                                | {}           |            | 
- url      | URL           | www.harewoodsolutions.co.uk/press.aspx | {simple}     | simple     | {www.harewoodsolutions.co.uk/press.aspx}
- host     | Host          | www.harewoodsolutions.co.uk            | {simple}     | simple     | {www.harewoodsolutions.co.uk}
- url_path | URL path      | /press.aspx                            | {simple}     | simple     | {/press.aspx}
- tag      | XML tag       | </span>                                | {}           |            | 
-(5 rows)
+   alias   |         description         |                 token                  |  dictionaries  |  dictionary  |                 lexemes                  
+-----------+-----------------------------+----------------------------------------+----------------+--------------+------------------------------------------
+ protocol  | Protocol head               | http://                                | {}             |              | 
+ url       | URL                         | www.harewoodsolutions.co.uk/press.aspx | {simple}       | simple       | {www.harewoodsolutions.co.uk/press.aspx}
+ host      | Host                        | www.harewoodsolutions.co.uk            | {simple}       | simple       | {www.harewoodsolutions.co.uk}
+ parttoken | Part of file/url/host/email | www                                    | {english_stem} | english_stem | {www}
+ blank     | Space symbols               | .                                      | {}             |              | 
+ parttoken | Part of file/url/host/email | harewoodsolutions                      | {english_stem} | english_stem | {harewoodsolut}
+ blank     | Space symbols               | .                                      | {}             |              | 
+ parttoken | Part of file/url/host/email | co                                     | {english_stem} | english_stem | {co}
+ blank     | Space symbols               | .                                      | {}             |              | 
+ parttoken | Part of file/url/host/email | uk                                     | {english_stem} | english_stem | {uk}
+ url_path  | URL path                    | /press.aspx                            | {simple}       | simple       | {/press.aspx}
+ blank     | Space symbols               | /                                      | {}             |              | 
+ parttoken | Part of file/url/host/email | press                                  | {english_stem} | english_stem | {press}
+ blank     | Space symbols               | .                                      | {}             |              | 
+ parttoken | Part of file/url/host/email | aspx                                   | {english_stem} | english_stem | {aspx}
+ tag       | XML tag                     | </span>                                | {}             |              | 
+(16 rows)
 
 SELECT * from ts_debug('english', 'http://aew.wer0c.ewr/id?ad=qwe&dw<span>');
-  alias   |  description  |           token            | dictionaries | dictionary |           lexemes            
-----------+---------------+----------------------------+--------------+------------+------------------------------
- protocol | Protocol head | http://                    | {}           |            | 
- url      | URL           | aew.wer0c.ewr/id?ad=qwe&dw | {simple}     | simple     | {aew.wer0c.ewr/id?ad=qwe&dw}
- host     | Host          | aew.wer0c.ewr              | {simple}     | simple     | {aew.wer0c.ewr}
- url_path | URL path      | /id?ad=qwe&dw              | {simple}     | simple     | {/id?ad=qwe&dw}
- tag      | XML tag       | <span>                     | {}           |            | 
-(5 rows)
+   alias   |         description         |           token            |  dictionaries  |  dictionary  |           lexemes            
+-----------+-----------------------------+----------------------------+----------------+--------------+------------------------------
+ protocol  | Protocol head               | http://                    | {}             |              | 
+ url       | URL                         | aew.wer0c.ewr/id?ad=qwe&dw | {simple}       | simple       | {aew.wer0c.ewr/id?ad=qwe&dw}
+ host      | Host                        | aew.wer0c.ewr              | {simple}       | simple       | {aew.wer0c.ewr}
+ parttoken | Part of file/url/host/email | aew                        | {english_stem} | english_stem | {aew}
+ blank     | Space symbols               | .                          | {}             |              | 
+ parttoken | Part of file/url/host/email | wer                        | {english_stem} | english_stem | {wer}
+ parttoken | Part of file/url/host/email | 0c                         | {english_stem} | english_stem | {0c}
+ blank     | Space symbols               | .                          | {}             |              | 
+ parttoken | Part of file/url/host/email | ewr                        | {english_stem} | english_stem | {ewr}
+ url_path  | URL path                    | /id?ad=qwe&dw              | {simple}       | simple       | {/id?ad=qwe&dw}
+ blank     | Space symbols               | /                          | {}             |              | 
+ parttoken | Part of file/url/host/email | id                         | {english_stem} | english_stem | {id}
+ blank     | Space symbols               | ?                          | {}             |              | 
+ parttoken | Part of file/url/host/email | ad                         | {english_stem} | english_stem | {ad}
+ blank     | Space symbols               | =                          | {}             |              | 
+ parttoken | Part of file/url/host/email | qwe                        | {english_stem} | english_stem | {qwe}
+ blank     | Space symbols               | &                          | {}             |              | 
+ parttoken | Part of file/url/host/email | dw                         | {english_stem} | english_stem | {dw}
+ tag       | XML tag                     | <span>                     | {}             |              | 
+(19 rows)
 
 SELECT * from ts_debug('english', 'http://5aew.werc.ewr:8100/?');
-  alias   |  description  |        token         | dictionaries | dictionary |        lexemes         
-----------+---------------+----------------------+--------------+------------+------------------------
- protocol | Protocol head | http://              | {}           |            | 
- url      | URL           | 5aew.werc.ewr:8100/? | {simple}     | simple     | {5aew.werc.ewr:8100/?}
- host     | Host          | 5aew.werc.ewr:8100   | {simple}     | simple     | {5aew.werc.ewr:8100}
- url_path | URL path      | /?                   | {simple}     | simple     | {/?}
-(4 rows)
+   alias   |         description         |        token         |  dictionaries  |  dictionary  |        lexemes         
+-----------+-----------------------------+----------------------+----------------+--------------+------------------------
+ protocol  | Protocol head               | http://              | {}             |              | 
+ url       | URL                         | 5aew.werc.ewr:8100/? | {simple}       | simple       | {5aew.werc.ewr:8100/?}
+ host      | Host                        | 5aew.werc.ewr:8100   | {simple}       | simple       | {5aew.werc.ewr:8100}
+ parttoken | Part of file/url/host/email | 5aew                 | {english_stem} | english_stem | {5aew}
+ blank     | Space symbols               | .                    | {}             |              | 
+ parttoken | Part of file/url/host/email | werc                 | {english_stem} | english_stem | {werc}
+ blank     | Space symbols               | .                    | {}             |              | 
+ parttoken | Part of file/url/host/email | ewr                  | {english_stem} | english_stem | {ewr}
+ blank     | Space symbols               | :                    | {}             |              | 
+ parttoken | Part of file/url/host/email | 8100                 | {english_stem} | english_stem | {8100}
+ url_path  | URL path                    | /?                   | {simple}       | simple       | {/?}
+ blank     | Space symbols               | /?                   | {}             |              | 
+(12 rows)
 
 SELECT * from ts_debug('english', '5aew.werc.ewr:8100/?xx');
-  alias   | description |         token          | dictionaries | dictionary |         lexemes          
-----------+-------------+------------------------+--------------+------------+--------------------------
- url      | URL         | 5aew.werc.ewr:8100/?xx | {simple}     | simple     | {5aew.werc.ewr:8100/?xx}
- host     | Host        | 5aew.werc.ewr:8100     | {simple}     | simple     | {5aew.werc.ewr:8100}
- url_path | URL path    | /?xx                   | {simple}     | simple     | {/?xx}
-(3 rows)
+   alias   |         description         |         token          |  dictionaries  |  dictionary  |         lexemes          
+-----------+-----------------------------+------------------------+----------------+--------------+--------------------------
+ url       | URL                         | 5aew.werc.ewr:8100/?xx | {simple}       | simple       | {5aew.werc.ewr:8100/?xx}
+ host      | Host                        | 5aew.werc.ewr:8100     | {simple}       | simple       | {5aew.werc.ewr:8100}
+ parttoken | Part of file/url/host/email | 5aew                   | {english_stem} | english_stem | {5aew}
+ blank     | Space symbols               | .                      | {}             |              | 
+ parttoken | Part of file/url/host/email | werc                   | {english_stem} | english_stem | {werc}
+ blank     | Space symbols               | .                      | {}             |              | 
+ parttoken | Part of file/url/host/email | ewr                    | {english_stem} | english_stem | {ewr}
+ blank     | Space symbols               | :                      | {}             |              | 
+ parttoken | Part of file/url/host/email | 8100                   | {english_stem} | english_stem | {8100}
+ url_path  | URL path                    | /?xx                   | {simple}       | simple       | {/?xx}
+ blank     | Space symbols               | /?                     | {}             |              | 
+ parttoken | Part of file/url/host/email | xx                     | {english_stem} | english_stem | {xx}
+(12 rows)
 
 -- to_tsquery
 SELECT to_tsquery('english', 'qwe & sKies ');
@@ -1002,7 +1167,7 @@
 SELECT to_tsvector('SKIES My booKs');
         to_tsvector         
 ----------------------------
- 'books':3 'my':2 'skies':1
+ 'books':2 'my':1 'skies':0
 (1 row)
 
 SELECT plainto_tsquery('SKIES My booKs');
@@ -1021,7 +1186,7 @@
 SELECT to_tsvector('SKIES My booKs');
    to_tsvector    
 ------------------
- 'book':3 'sky':1
+ 'book':2 'sky':0
 (1 row)
 
 SELECT plainto_tsquery('SKIES My booKs');
@@ -1075,20 +1240,20 @@
 select * from pendtest where 'ipsu:*'::tsquery @@ ts;
          ts         
 --------------------
- 'ipsum':2 'lore':1
+ 'ipsum':1 'lore':0
 (1 row)
 
 select * from pendtest where 'ipsa:*'::tsquery @@ ts;
          ts         
 --------------------
- 'ipsam':2 'lore':1
+ 'ipsam':1 'lore':0
 (1 row)
 
 select * from pendtest where 'ips:*'::tsquery @@ ts;
          ts         
 --------------------
- 'ipsam':2 'lore':1
- 'ipsum':2 'lore':1
+ 'ipsam':1 'lore':0
+ 'ipsum':1 'lore':0
 (2 rows)
 
 select * from pendtest where 'ipt:*'::tsquery @@ ts;
-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Reply via email to