[HACKERS] experimental: TSearch dictionary [de]serialization

Pavel Stehule Tue, 31 Aug 2010 15:20:33 -0700

Hello

I wrote some very primitive code for testing serialization and
deserialization of a TSearch ISpell dictionary. This code works - but it
is useful only for speed tests for now.


The Czech fulltext dictionary is serialized to a file about 9MB long. Saving
takes about 90ms and reading takes the same time.

 postgres=# select * from ts_debug('cs','příliš žluťoučký kůň se napil
žluté vody');
   alias   │    description    │   token   │  dictionaries   │
dictionary │   lexemes
───────────┼───────────────────┼───────────┼─────────────────┼────────────┼─────────────
 word      │ Word, all letters │ příliš    │ {cspell,simple} │ cspell
   │ {příliš}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ žluťoučký │ {cspell,simple} │ cspell
   │ {žluťoučký}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ kůň       │ {cspell,simple} │ cspell
   │ {kůň}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ se        │ {cspell,simple} │ cspell     │ {}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ napil     │ {cspell,simple} │ cspell
   │ {napít}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ žluté     │ {cspell,simple} │ cspell
   │ {žlutý}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ vody      │ {cspell,simple} │ cspell
   │ {voda}
(13 rows)

Time: 92.708 ms -- using a preprocessed dictionary

postgres=# select * from ts_debug('cs','příliš žluťoučký kůň se napil
žluté vody');
   alias   │    description    │   token   │  dictionaries   │
dictionary │   lexemes
───────────┼───────────────────┼───────────┼─────────────────┼────────────┼─────────────
 word      │ Word, all letters │ příliš    │ {cspell,simple} │ cspell
   │ {příliš}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ žluťoučký │ {cspell,simple} │ cspell
   │ {žluťoučký}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ kůň       │ {cspell,simple} │ cspell
   │ {kůň}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ se        │ {cspell,simple} │ cspell     │ {}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ napil     │ {cspell,simple} │ cspell
   │ {napít}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ žluté     │ {cspell,simple} │ cspell
   │ {žlutý}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ vody      │ {cspell,simple} │ cspell
   │ {voda}
(13 rows)

Time: 3.758 ms -- standard time (dictionary is loaded)

postgres=# select * from ts_debug('cs','příliš žluťoučký kůň se napil
žluté vody');
   alias   │    description    │   token   │  dictionaries   │
dictionary │   lexemes
───────────┼───────────────────┼───────────┼─────────────────┼────────────┼─────────────
 word      │ Word, all letters │ příliš    │ {cspell,simple} │ cspell
   │ {příliš}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ žluťoučký │ {cspell,simple} │ cspell
   │ {žluťoučký}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ kůň       │ {cspell,simple} │ cspell
   │ {kůň}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ se        │ {cspell,simple} │ cspell     │ {}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ napil     │ {cspell,simple} │ cspell
   │ {napít}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 word      │ Word, all letters │ žluté     │ {cspell,simple} │ cspell
   │ {žlutý}
 blank     │ Space symbols     │           │ {}              │ [null]
   │ [null]
 asciiword │ Word, all ASCII   │ vody      │ {cspell,simple} │ cspell
   │ {voda}
(13 rows)

Time: 518.528 ms --- typical first evaluation time

So using a preprocessed file helps - the time of the first processing is
about 4x better. But this time is still 20x slower than using a loaded
dictionary. I found one issue - I am not able to serialize a full
regexp. The Czech dictionary doesn't use it, so I didn't solve this task.
I would like to implement a few hooks in ISpellDictionary to make it
possible to implement custom memory management for ispell dictionaries. I
understand the problems with shared memory or mmap - but I don't see
any other way than to use third-party mmap support. This module
does not need to be in core - this is probably only a local Czech (and maybe
Japanese) problem.

Regards

Pavel Stehule

*** ./src/backend/tsearch/dict_ispell.c.orig	2010-08-23 09:16:49.000000000 +0200
--- ./src/backend/tsearch/dict_ispell.c	2010-08-31 23:46:00.178669635 +0200
***************
*** 37,113 ****
  				dictloaded = false,
  				stoploaded = false;
  	ListCell   *l;
  
  	d = (DictISpell *) palloc0(sizeof(DictISpell));
  
! 	foreach(l, dictoptions)
  	{
! 		DefElem    *defel = (DefElem *) lfirst(l);
! 
! 		if (pg_strcasecmp(defel->defname, "DictFile") == 0)
  		{
! 			if (dictloaded)
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 						 errmsg("multiple DictFile parameters")));
! 			NIImportDictionary(&(d->obj),
! 							 get_tsearch_config_filename(defGetString(defel),
! 														 "dict"));
! 			dictloaded = true;
  		}
! 		else if (pg_strcasecmp(defel->defname, "AffFile") == 0)
  		{
! 			if (affloaded)
! 				ereport(ERROR,
! 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 						 errmsg("multiple AffFile parameters")));
! 			NIImportAffixes(&(d->obj),
! 							get_tsearch_config_filename(defGetString(defel),
! 														"affix"));
! 			affloaded = true;
  		}
! 		else if (pg_strcasecmp(defel->defname, "StopWords") == 0)
  		{
! 			if (stoploaded)
! 				ereport(ERROR,
! 						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 						 errmsg("multiple StopWords parameters")));
! 			readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
! 			stoploaded = true;
  		}
  		else
  		{
  			ereport(ERROR,
  					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					 errmsg("unrecognized Ispell parameter: \"%s\"",
! 							defel->defname)));
  		}
! 	}
  
- 	if (affloaded && dictloaded)
- 	{
- 		NISortDictionary(&(d->obj));
- 		NISortAffixes(&(d->obj));
- 	}
- 	else if (!affloaded)
- 	{
- 		ereport(ERROR,
- 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 				 errmsg("missing AffFile parameter")));
- 	}
- 	else
- 	{
- 		ereport(ERROR,
- 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
- 				 errmsg("missing DictFile parameter")));
  	}
  
  	MemoryContextDeleteChildren(CurrentMemoryContext);
  	
  	MemoryContextStats(CurrentMemoryContext);
  	
  	
- 
  	PG_RETURN_POINTER(d);
  }
  
--- 37,132 ----
  				dictloaded = false,
  				stoploaded = false;
  	ListCell   *l;
+ 	int i;
  
  	d = (DictISpell *) palloc0(sizeof(DictISpell));
+ 	
+ 	d->obj.stream = fopen("/tmp/xxx.ft", "r");
+ 	d->obj.mode = 'r';
  
! 	if (d->obj.mode == 'r')
  	{
! 		readSPDict(d->obj.stream, &d->obj);
! 		readAffix(d->obj.stream, &d->obj);
! 		postProcessAffixes(&d->obj);
! 		readStopList(d->obj.stream, &d->stoplist);
! 	}
! 	else
! 	{
! 		foreach(l, dictoptions)
  		{
! 			DefElem    *defel = (DefElem *) lfirst(l);
! 
! 			if (pg_strcasecmp(defel->defname, "DictFile") == 0)
! 			{
! 				if (dictloaded)
! 					ereport(ERROR,
! 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 							 errmsg("multiple DictFile parameters")));
! 				NIImportDictionary(&(d->obj),
! 								 get_tsearch_config_filename(defGetString(defel),
! 															 "dict"));
! 				dictloaded = true;
! 			}
! 			else if (pg_strcasecmp(defel->defname, "AffFile") == 0)
! 			{
! 				if (affloaded)
! 					ereport(ERROR,
! 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 							 errmsg("multiple AffFile parameters")));
! 				NIImportAffixes(&(d->obj),
! 								get_tsearch_config_filename(defGetString(defel),
! 															"affix"));
! 				affloaded = true;
! 			}
! 			else if (pg_strcasecmp(defel->defname, "StopWords") == 0)
! 			{
! 				if (stoploaded)
! 					ereport(ERROR,
! 							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 							 errmsg("multiple StopWords parameters")));
! 				readstoplist(defGetString(defel), &(d->stoplist), lowerstr);
! 				stoploaded = true;
! 				
! 			}
! 			else
! 			{
  				ereport(ERROR,
  						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 						 errmsg("unrecognized Ispell parameter: \"%s\"",
! 								defel->defname)));
! 			}
  		}
! 
! 		if (affloaded && dictloaded)
  		{
! 			NISortDictionary(&(d->obj));
! 			NISortAffixes(&(d->obj));
  		}
! 		else if (!affloaded)
  		{
! 			ereport(ERROR,
! 					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					 errmsg("missing AffFile parameter")));
  		}
  		else
  		{
  			ereport(ERROR,
  					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
! 					 errmsg("missing DictFile parameter")));
  		}
! 		
! 		if (d->obj.stream != NULL && d->obj.mode == 'w')
! 			outStopList(d->obj.stream, &d->stoplist);
  
  	}
  
  	MemoryContextDeleteChildren(CurrentMemoryContext);
  	
  	MemoryContextStats(CurrentMemoryContext);
  	
+ 	fclose(d->obj.stream);
  	
  	PG_RETURN_POINTER(d);
  }
  
*** ./src/backend/tsearch/spell.c.orig	2010-01-02 17:57:53.000000000 +0100
--- ./src/backend/tsearch/spell.c	2010-08-31 23:55:16.054672520 +0200
***************
*** 11,23 ****
   *
   *-------------------------------------------------------------------------
   */
- 
  #include "postgres.h"
  
  #include "tsearch/dicts/spell.h"
  #include "tsearch/ts_locale.h"
  #include "utils/memutils.h"
  
  
  /*
   * Initialization requires a lot of memory that's not needed
--- 11,26 ----
   *
   *-------------------------------------------------------------------------
   */
  #include "postgres.h"
  
  #include "tsearch/dicts/spell.h"
  #include "tsearch/ts_locale.h"
+ #include "tsearch/ts_public.h"
  #include "utils/memutils.h"
  
+ #include <stdio.h>
+ #include <time.h>
+ 
  
  /*
   * Initialization requires a lot of memory that's not needed
***************
*** 28,36 ****
--- 31,367 ----
   */
  static MemoryContext tmpCtx = NULL;
  
+ static void *prealloc_mem = NULL;
+ static Size prealloc_free_size;
+ 
+ static void checkTmpCtx(void);
+ 
  #define tmpalloc(sz)  MemoryContextAlloc(tmpCtx, (sz))
  #define tmpalloc0(sz)  MemoryContextAllocZero(tmpCtx, (sz))
  
+ #define WRITE_BINARY(buff, stream)  \
+ 	do { \
+ 		if (fwrite(&(buff), sizeof(buff), 1, stream) != 1) \
+ 			elog(ERROR, "cannot to write to prepared dictionary file"); \
+ 	} while (0);
+ 
+ #define WRITE_STRING(buff, stream) \
+ 	do { \
+ 		int len = -1; \
+ 		if ((buff) != NULL) \
+ 		{ \
+ 			int len = strlen(buff) + 1; \
+ 			WRITE_BINARY(len, stream); \
+ 			if (fwrite(buff, len, 1, stream) != 1) \
+ 				elog(ERROR, "cannot to write to prepared dictionary file"); \
+ 		} \
+ 		else \
+ 		{ \
+ 			WRITE_BINARY(len, stream); \
+ 		} \
+ 	} while (0);
+ 
+ #define WRITE_BINARY_STRING(buff, size, stream) \
+ 	do { \
+ 		if (fwrite(buff, size, 1, stream) != 1) \
+ 			elog(ERROR, "cannot to write to prepared dictionary file"); \
+ 	} while (0);
+ 
+ #define READ_BINARY(buff, stream)  \
+ 	do { \
+ 		if (fread(&(buff), sizeof(buff), 1, stream) != 1) \
+ 			elog(ERROR, "cannot to load a prepared dictionary file"); \
+ 	} while (0)
+ 
+ #define READ_STRING(target, stream)  \
+ 	do { \
+ 		int len; \
+ 		READ_BINARY(len, stream); \
+ 		if (len != -1) \
+ 		{ \
+ 			target = (char *) palloc(len); \
+ 			if (fread(target, len, 1, stream) != 1) \
+ 				elog(ERROR, "cannot to load a prepared dictionary file"); \
+ 		} \
+ 		else \
+ 			target = NULL; \
+ 	} while (0)
+ 
+ #define READ_BINARY_STRING(buff, size, stream) \
+ 	do { \
+ 		if (fread(buff, size, 1, stream) != 1) \
+ 			elog(ERROR, "cannot to load a prepared dictionary file"); \
+ 	} while(0);
+ 
+ /*
+  * A spell dictionary uses thousands of SPNodes. These nodes are never
+  * individually released, so we can bypass memory context management
+  * and save a significant amount of memory.
+  */
+ static SPNode *
+ allocSPNode(int nchar)
+ {
+ 	Size size = MAXALIGN(SPNHDRSZ + nchar * sizeof(SPNodeData));
+ 	void *ret;
+ 
+ 	/* use a prealloc_mem only for small requests */
+ 	if (size > ALLOCSET_DEFAULT_INITSIZE / 3)
+ 		return palloc(size);
+ 
+ 	if (prealloc_mem == NULL || size > prealloc_free_size)
+ 	{
+ 		prealloc_mem = palloc(ALLOCSET_DEFAULT_INITSIZE);
+ 		prealloc_free_size = ALLOCSET_DEFAULT_INITSIZE;
+ 	}
+ 	
+ 	Assert(prealloc_mem != NULL);
+ 	Assert(prealloc_mem == (void *) MAXALIGN(prealloc_mem));
+ 	
+ 	ret = memset(prealloc_mem, 0, size);
+ 	
+ 	/* reduce a used block from preallocated memory */
+ 	prealloc_free_size -= size;
+ 	prealloc_mem = (char *) prealloc_mem + size;
+ 	
+ 	return ret;
+ }
+ 
+ /*
+  * Parsing a spell dictionary is slow, so we must minimize
+  * how often this task is done. One possibility is serialization
+  * and deserialization of the Ispell dictionary.
+  */
+ static void 
+ outSPNode(FILE *stream, SPNode *node)
+ {
+ 	int i;
+ 	uint32 length = node->length;
+ 	
+ 	WRITE_BINARY(length, stream);
+ 	
+ 	for (i = 0; i < node->length; i++)
+ 	{
+ 		SPNodeData *data = &node->data[i];
+ 		uint32 aux = data->val | data->isword <<  8 
+ 				| data->compoundflag << 9 | data->affix << 13;
+ 				
+ 		WRITE_BINARY(aux, stream);
+ 		
+ 		if (data->node)
+ 			outSPNode(stream, data->node);
+ 		else
+ 		{
+ 			length = 0;
+ 			WRITE_BINARY(length, stream);
+ 		}
+ 	}
+ }
+ 
+ static SPNode *
+ readSPNode(FILE *stream)
+ {
+ 	int i;
+ 	uint32 length;
+ 	SPNode *node;
+ 	
+ 	READ_BINARY(length, stream);
+ 	
+ 	/* there are not other node */
+ 	if (length == 0)
+ 		return NULL;
+ 	
+ 	node = allocSPNode(length);
+ 	node->length = length;
+ 	
+ 	for (i = 0; i < node->length; i++)
+ 	{
+ 		SPNodeData *data = &node->data[i];
+ 		uint32 aux;
+ 		
+ 		READ_BINARY(aux, stream);
+ 		
+ 		data->val = aux & 0xFF;
+ 		data->isword = aux >> 8 & 1;
+ 		data->compoundflag = aux >> 9 & 0xF;
+ 		data->affix = aux >> 13 & 0x7FFFF;
+ 		
+ 		data->node = readSPNode(stream);
+ 	}
+ 
+ 	return node;
+ }
+ 
+ static void 
+ outSPDict(FILE *stream, IspellDict *Conf)
+ {
+ 	int i;
+ 
+ 	WRITE_BINARY(Conf->nAffixData, stream);
+ 
+ 	for (i = 0; i < Conf->nAffixData; i++)
+ 	{
+ 		WRITE_STRING(Conf->AffixData[i], stream);
+ 	}
+ 	
+ 	outSPNode(stream, Conf->Dictionary);
+ }
+ 
+ void
+ readSPDict(FILE *stream, IspellDict *Conf)
+ {
+ 	int i;
+ 
+ 	checkTmpCtx();
+ 
+ 	READ_BINARY(Conf->nAffixData, stream);
+ 	
+ 	Conf->AffixData = (char **) palloc(Conf->nAffixData * sizeof(char *));
+ 	
+ 	for (i = 0; i < Conf->nAffixData; i++)
+ 	{
+ 		READ_STRING(Conf->AffixData[i], stream);
+ 	}
+ 	
+ 	Conf->Dictionary = readSPNode(stream);
+ }
+ 
+ static void 
+ outRegisNode(FILE *stream, RegisNode *node)
+ {
+ 	do
+ 	{
+ 		int len = node->len;
+ 		uint32 aux = node->type | node->len << 2;
+ 		
+ 		WRITE_BINARY(len, stream);
+ 		WRITE_BINARY(aux, stream);
+ 		WRITE_BINARY_STRING(&node->data, len, stream);
+ 		
+ 		node = node->next;
+ 		if (!node)
+ 		{
+ 			/* append end tag */
+ 			len = 0;
+ 			WRITE_BINARY(len, stream);
+ 		}
+ 		
+ 	} while (node != NULL);
+ }
+ 
+ static RegisNode *
+ readRegisNode(FILE *stream)
+ {
+ 	int len;
+ 	RegisNode *result = NULL;
+ 	RegisNode *node,
+ 			*prev = NULL;
+ 
+ 	do 
+ 	{
+ 		READ_BINARY(len, stream);
+ 		if (len > 0)
+ 		{
+ 			uint32 aux;
+ 		
+ 			node = (RegisNode *) palloc0(RNHDRSZ + len + 1);
+ 			if (result == NULL)
+ 				result = node;
+ 			else
+ 				prev->next = node;
+ 			
+ 			READ_BINARY(aux, stream);
+ 			node->type = aux & 3;
+ 			node->len = aux >> 2 & 65535;
+ 			READ_BINARY_STRING(node->data, len, stream);
+ 			prev = node;
+ 		}
+ 	} while (len > 0);
+ 
+ 	return result;
+ }
+ 
+ static void
+ outRegis(FILE *stream, Regis *regis)
+ {
+ 	uint32 aux = regis->issuffix | regis->nchar << 1;
+ 	
+ 	WRITE_BINARY(aux, stream);
+ 	outRegisNode(stream, regis->node);
+ }
+ 
+ static void 
+ readRegis(FILE *stream, Regis *regis)
+ {
+ 	uint32 aux;
+ 	
+ 	READ_BINARY(aux, stream);
+ 	regis->issuffix = aux & 1;
+ 	regis->nchar = aux >> 1 & 65535;
+ 	regis->node = readRegisNode(stream);
+ }
+ 
+ static void 
+ outAFFIX(FILE *stream, AFFIX *aff)
+ {
+ 	uint32 aux = aff->flag | aff->type << 8 | aff->flagflags << 9 |
+ 				aff->issimple << 16 | aff->isregis << 17 | aff->replen << 18;
+ 	
+ 	WRITE_BINARY(aux, stream);
+ 	WRITE_STRING(aff->find, stream);
+ 	WRITE_STRING(aff->repl, stream);
+ 	
+ 	if (aff->isregis)
+ 		outRegis(stream, &aff->reg.regis);
+ }
+ 
+ static void
+ readAFFIX(FILE *stream, AFFIX *aff)
+ {
+ 	uint32 aux;
+ 	
+ 	checkTmpCtx();
+ 	
+ 	READ_BINARY(aux, stream);
+ 	aff->flag = aux & 255;
+ 	aff->type = aux >> 8 & 1;
+ 	aff->flagflags = aux >> 9 & 127;
+ 	aff->issimple = aux >> 16 & 1;
+ 	aff->isregis = aux >> 17 & 1;
+ 	aff->replen = (aux >> 18) & 16383;
+ 	
+ 	READ_STRING(aff->find, stream);
+ 	READ_STRING(aff->repl, stream);
+ 	
+ 	if (aff->isregis)
+ 		readRegis(stream, &aff->reg.regis);
+ }
+ 
+ static void
+ outAffix(FILE *stream, IspellDict *Conf)
+ {
+ 	int	i;
+ 
+ 	WRITE_BINARY(Conf->naffixes, stream);
+ 	for (i = 0; i < Conf->naffixes; i++)
+ 	{
+ 		outAFFIX(stream, &Conf->Affix[i]);
+ 	}
+ }
+ 
+ void
+ readAffix(FILE *stream, IspellDict *Conf)
+ {
+ 	int i;
+ 	
+ 	READ_BINARY(Conf->naffixes, stream);
+ 	
+ 	Conf->Affix = (AFFIX *) palloc(Conf->naffixes * sizeof(AFFIX));
+ 	for (i = 0; i < Conf->naffixes; i++)
+ 	{
+ 		readAFFIX(stream, &Conf->Affix[i]);
+ 	}
+ }
+ 
  static void
  checkTmpCtx(void)
  {
***************
*** 63,68 ****
--- 394,424 ----
  	return dst;
  }
  
+ void 
+ outStopList(FILE *stream, StopList *s)
+ {
+ 	int	i;
+ 	
+ 	WRITE_BINARY(s->len, stream);
+ 	for (i = 0; i < s->len; i++)
+ 	{
+ 		WRITE_STRING(s->stop[i], stream);
+ 	}
+ }
+ 
+ void
+ readStopList(FILE *stream, StopList *s)
+ {
+ 	int i;
+ 	
+ 	READ_BINARY(s->len, stream);
+ 	s->stop = (char **) palloc(s->len * sizeof(char *));
+ 	for(i = 0; i < s->len; i++)
+ 	{
+ 		READ_STRING(s->stop[i], stream);
+ 	}
+ }
+ 
  #define MAX_NORM 1024
  #define MAXNORMLEN 256
  
***************
*** 252,258 ****
  	tsearch_readline_end(&trst);
  }
  
- 
  static int
  FindWord(IspellDict *Conf, const char *word, int affixflag, int flag)
  {
--- 608,613 ----
***************
*** 261,266 ****
--- 616,623 ----
  			   *StopHigh,
  			   *StopMiddle;
  	uint8	   *ptr = (uint8 *) word;
+ 	static int xx = 0;
+ 
  
  	flag &= FF_DICTFLAGMASK;
  
***************
*** 270,276 ****
--- 627,635 ----
  		StopHigh = node->data + node->length;
  		while (StopLow < StopHigh)
  		{
+ 
  			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
+ 			
  			if (StopMiddle->val == *ptr)
  			{
  				if (*(ptr + 1) == '\0' && StopMiddle->isword)
***************
*** 321,326 ****
--- 680,686 ----
  	}
  
  	Affix = Conf->Affix + Conf->naffixes;
+ 	Affix->mask = pstrdup(mask);
  
  	if (strcmp(mask, ".") == 0)
  	{
***************
*** 878,884 ****
  	if (!nchar)
  		return NULL;
  
! 	rs = (SPNode *) palloc0(SPNHDRSZ + nchar * sizeof(SPNodeData));
  	rs->length = nchar;
  	data = rs->data;
  
--- 1238,1244 ----
  	if (!nchar)
  		return NULL;
  
! 	rs = allocSPNode(nchar);
  	rs->length = nchar;
  	data = rs->data;
  
***************
*** 987,992 ****
--- 1347,1358 ----
  	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
  
  	Conf->Spell = NULL;
+ 	
+ 	/* serialize a dictionary */
+ 	if (Conf->stream && Conf->mode == 'w') 
+ 	{
+ 		outSPDict(Conf->stream, Conf);
+ 	}
  }
  
  static AffixNode *
***************
*** 1000,1012 ****
  	int			lownew = low;
  	int			naff;
  	AFFIX	  **aff;
! 
  	for (i = low; i < high; i++)
  		if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
  		{
  			nchar++;
  			lastchar = GETCHAR(Conf->Affix + i, level, type);
  		}
  
  	if (!nchar)
  		return NULL;
--- 1366,1380 ----
  	int			lownew = low;
  	int			naff;
  	AFFIX	  **aff;
! 	
  	for (i = low; i < high; i++)
+ 	{
  		if (Conf->Affix[i].replen > level && lastchar != GETCHAR(Conf->Affix + i, level, type))
  		{
  			nchar++;
  			lastchar = GETCHAR(Conf->Affix + i, level, type);
  		}
+ 	}
  
  	if (!nchar)
  		return NULL;
***************
*** 1092,1097 ****
--- 1460,1466 ----
  		return;
  
  	Affix->data->aff = (AFFIX **) palloc(sizeof(AFFIX *) * cnt);
+ 	
  	Affix->data->naff = (uint32) cnt;
  
  	cnt = 0;
***************
*** 1130,1135 ****
--- 1499,1555 ----
  
  	if (Conf->naffixes > 1)
  		qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
+ 
+ 	/* Serialize affix */
+ 	if (Conf->stream && Conf->mode == 'w')
+ 	{
+ 		outAffix(Conf->stream, Conf);
+ 	}
+ 		
+ 	Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
+ 	ptr->affix = NULL;
+ 
+ 	for (i = 0; i < Conf->naffixes; i++)
+ 	{
+ 		Affix = &(((AFFIX *) Conf->Affix)[i]);
+ 		if (Affix->type == FF_SUFFIX && i < firstsuffix)
+ 			firstsuffix = i;
+ 
+ 		if ((Affix->flagflags & FF_COMPOUNDFLAG) && Affix->replen > 0 &&
+ 			isAffixInUse(Conf, (char) Affix->flag))
+ 		{
+ 			if (ptr == Conf->CompoundAffix ||
+ 				ptr->issuffix != (ptr - 1)->issuffix ||
+ 				strbncmp((const unsigned char *) (ptr - 1)->affix,
+ 						 (const unsigned char *) Affix->repl,
+ 						 (ptr - 1)->len))
+ 			{
+ 				/* leave only unique and minimals suffixes */
+ 				ptr->affix = Affix->repl;
+ 				ptr->len = Affix->replen;
+ 				ptr->issuffix = (Affix->type == FF_SUFFIX) ? true : false;
+ 				ptr++;
+ 			}
+ 		}
+ 	}
+ 	ptr->affix = NULL;
+ 	Conf->CompoundAffix = (CMPDAffix *) repalloc(Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr - Conf->CompoundAffix + 1));
+ 
+ 	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, FF_PREFIX);
+ 	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, FF_SUFFIX);
+ 	mkVoidAffix(Conf, true, firstsuffix);
+ 	mkVoidAffix(Conf, false, firstsuffix);
+ }
+ 
+ 
+ void 
+ postProcessAffixes(IspellDict *Conf)
+ {
+ 	AFFIX	   *Affix;
+ 	size_t		i;
+ 	CMPDAffix  *ptr;
+ 	int			firstsuffix = Conf->naffixes;
+ 	
  	Conf->CompoundAffix = ptr = (CMPDAffix *) palloc(sizeof(CMPDAffix) * Conf->naffixes);
  	ptr->affix = NULL;
  
***************
*** 1172,1177 ****
--- 1592,1598 ----
  			   *StopHigh,
  			   *StopMiddle;
  	uint8 symbol;
+ 	static int xx = 0;
  
  	if (node->isvoid)
  	{							/* search void affixes */
***************
*** 1188,1199 ****
  		{
  			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
  			symbol = GETWCHAR(word, wrdlen, *level, type);
! 
  			if (StopMiddle->val == symbol)
  			{
  				(*level)++;
  				if (StopMiddle->naff)
  					return StopMiddle;
  				node = StopMiddle->node;
  				break;
  			}
--- 1609,1622 ----
  		{
  			StopMiddle = StopLow + ((StopHigh - StopLow) >> 1);
  			symbol = GETWCHAR(word, wrdlen, *level, type);
! 			
  			if (StopMiddle->val == symbol)
  			{
  				(*level)++;
  				if (StopMiddle->naff)
+ 				{
  					return StopMiddle;
+ 				}
  				node = StopMiddle->node;
  				break;
  			}
***************
*** 1372,1378 ****
  	while (snode)
  	{
  		int			baselen = 0;
- 
  		/* find possible suffix */
  		suffix = FindAffixes(snode, word, wrdlen, &slevel, FF_SUFFIX);
  		if (!suffix)
--- 1795,1800 ----
***************
*** 1402,1408 ****
  							/* prefix success */
  							int			ff = (prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT) ?
  							0 : prefix->aff[j]->flag;
- 
  							if (FindWord(Conf, pnewword, ff, flag))
  								cur += addToResult(forms, cur, pnewword);
  						}
--- 1824,1829 ----
***************
*** 1420,1425 ****
--- 1841,1849 ----
  		pfree(forms);
  		return (NULL);
  	}
+ 	
+ 	cur = forms;
+ 	
  	return (forms);
  }
  
*** ./src/include/tsearch/dicts/spell.h.orig	2010-08-31 23:46:38.653669628 +0200
--- ./src/include/tsearch/dicts/spell.h	2010-08-31 23:46:47.469669487 +0200
***************
*** 161,166 ****
--- 161,168 ----
  
  	unsigned char flagval[256];
  	bool		usecompound;
+ 	FILE		*stream;
+ 	char mode;
  } IspellDict;
  
  extern TSLexeme *NINormalizeWord(IspellDict *Conf, char *word);
*** ./src/include/tsearch/ts_public.h.orig	2010-01-02 17:58:09.000000000 +0100
--- ./src/include/tsearch/ts_public.h	2010-08-31 23:46:00.185669425 +0200
***************
*** 78,83 ****
--- 78,87 ----
  			 char *(*wordop) (const char *));
  extern bool searchstoplist(StopList *s, char *key);
  
+ extern void outStopList(FILE *stream, StopList *s);
+ extern void readStopList(FILE *stream, StopList *s);
+ 
+ 
  /*
   * Interface with dictionaries
   */

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

[HACKERS] experimental: TSearch dictionary [de]serialization

Reply via email to