Re: [HACKERS] Mac OS: invalid byte sequence for encoding "UTF8"

Artur Zakirov Thu, 28 Jan 2016 06:42:59 -0800

On 27.01.2016 15:28, Artur Zakirov wrote:

On 27.01.2016 14:14, Stas Kelvich wrote:

Hi.


I tried that and can confirm the strange behaviour. It seems the problem is with
the small Cyrillic letter ‘х’. (simplest obscene language filter? =)

That can be reproduced with a simpler test

Stas


The test program was corrected. It now uses the wchar_t type, works
correctly, and gives the right output.

I think the NIImportOOAffixes() in spell.c should be corrected to avoid
this bug.

I have attached a patch. It adds new functions parse_ooaffentry() and get_nextentry() and fixes a couple of comments.

Now Russian and other supported dictionaries can be used for text search on Mac OS.

parse_ooaffentry() parses an affix file entry instead of sscanf(). It has a similar algorithm to the parse_affentry() function.

Should I create a new patch to fix this bug (as I did), or should this fix go with the patch http://www.postgresql.org/message-id/56aa02ee.6090...@postgrespro.ru ?


--
Artur Zakirov
Postgres Professional: http://www.postgrespro.com
Russian Postgres Company

*** a/src/backend/tsearch/spell.c
--- b/src/backend/tsearch/spell.c
***************
*** 458,469 **** NIAddAffix(IspellDict *Conf, int flag, char flagflags, const char *mask, const c
  }
  
  #define PAE_WAIT_MASK	0
! #define PAE_INMASK	1
  #define PAE_WAIT_FIND	2
! #define PAE_INFIND	3
  #define PAE_WAIT_REPL	4
! #define PAE_INREPL	5
  
  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl)
  {
--- 458,579 ----
  }
  
  #define PAE_WAIT_MASK	0
! #define PAE_INMASK		1
  #define PAE_WAIT_FIND	2
! #define PAE_INFIND		3
  #define PAE_WAIT_REPL	4
! #define PAE_INREPL		5
! #define PAE_WAIT_TYPE	6
! #define PAE_WAIT_FLAG	7
  
+ /*
+  * Used in parse_ooaffentry() to parse an .affix file entry.
+  */
+ static bool
+ get_nextentry(char **str, char *next)
+ {
+ 	int			state = PAE_WAIT_MASK;
+ 	char	   *pnext = next;
+ 
+ 	*next = '\0';
+ 
+ 	while (**str)
+ 	{
+ 		if (state == PAE_WAIT_MASK)
+ 		{
+ 			if (t_iseq(*str, '#'))
+ 				return false;
+ 			else if (!t_isspace(*str))
+ 			{
+ 				COPYCHAR(pnext, *str);
+ 				pnext += pg_mblen(*str);
+ 				state = PAE_INMASK;
+ 			}
+ 		}
+ 		else if (state == PAE_INMASK)
+ 		{
+ 			if (t_isspace(*str))
+ 			{
+ 				*pnext = '\0';
+ 				return true;
+ 			}
+ 			else
+ 			{
+ 				COPYCHAR(pnext, *str);
+ 				pnext += pg_mblen(*str);
+ 			}
+ 		}
+ 		*str += pg_mblen(*str);
+ 	}
+ 
+ 	*pnext ='\0';
+ 
+ 	return *next;
+ }
+ 
+ /*
+  * Parses entry of an .affix file of MySpell or Hunspell format.
+  *
+  * An .affix file entry has the following format:
+  * - header
+  *   <type>  <flag>  <cross_flag>  <flag_count>
+  * - fields after header:
+  *   <type>  <flag>  <find>  <replace>  <mask>
+  */
+ static int
+ parse_ooaffentry(char *str, char *type, char *flag, char *find,
+ 				char *repl, char *mask)
+ {
+ 	int			state = PAE_WAIT_TYPE,
+ 				next_state = PAE_WAIT_FLAG;
+ 	int			parse_read = 0;
+ 	bool		valid = true;
+ 
+ 	*type = *flag = *find = *repl = *mask = '\0';
+ 
+ 	while (*str && valid)
+ 	{
+ 		switch (state)
+ 		{
+ 			case PAE_WAIT_TYPE:
+ 				valid = get_nextentry(&str, type);
+ 				break;
+ 			case PAE_WAIT_FLAG:
+ 				valid = get_nextentry(&str, flag);
+ 				next_state = PAE_WAIT_FIND;
+ 				break;
+ 			case PAE_WAIT_FIND:
+ 				valid = get_nextentry(&str, find);
+ 				next_state = PAE_WAIT_REPL;
+ 				break;
+ 			case PAE_WAIT_REPL:
+ 				valid = get_nextentry(&str, repl);
+ 				next_state = PAE_WAIT_MASK;
+ 				break;
+ 			case PAE_WAIT_MASK:
+ 				get_nextentry(&str, mask);
+ 				/* break loop */
+ 				valid = false;
+ 				break;
+ 			default:
+ 				elog(ERROR, "unrecognized state in parse_ooaffentry: %d", state);
+ 		}
+ 		state = next_state;
+ 		if (*str)
+ 			str += pg_mblen(str);
+ 
+ 		parse_read++;
+ 	}
+ 
+ 	return parse_read;
+ }
+ 
+ /*
+  * Parses entry of an .affix file of Ispell format
+  *
+  * An .affix file entry has the following format:
+  * <mask>  >  [-<find>,]<replace>
+  */
  static bool
  parse_affentry(char *str, char *mask, char *find, char *repl)
  {
***************
*** 618,625 **** NIImportOOAffixes(IspellDict *Conf, const char *filename)
  	int			flag = 0;
  	char		flagflags = 0;
  	tsearch_readline_state trst;
! 	int			scanread = 0;
! 	char		scanbuf[BUFSIZ];
  	char	   *recoded;
  
  	/* read file to find any flag */
--- 728,734 ----
  	int			flag = 0;
  	char		flagflags = 0;
  	tsearch_readline_state trst;
! 	int			parseread = 0;
  	char	   *recoded;
  
  	/* read file to find any flag */
***************
*** 682,689 **** NIImportOOAffixes(IspellDict *Conf, const char *filename)
  	}
  	tsearch_readline_end(&trst);
  
- 	sprintf(scanbuf, "%%6s %%%ds %%%ds %%%ds %%%ds", BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5, BUFSIZ / 5);
- 
  	if (!tsearch_readline_begin(&trst, filename))
  		ereport(ERROR,
  				(errcode(ERRCODE_CONFIG_FILE_ERROR),
--- 791,796 ----
***************
*** 695,709 **** NIImportOOAffixes(IspellDict *Conf, const char *filename)
  		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
  			goto nextline;
  
! 		scanread = sscanf(recoded, scanbuf, type, sflag, find, repl, mask);
  
  		if (ptype)
  			pfree(ptype);
  		ptype = lowerstr_ctx(Conf, type);
! 		if (scanread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
  			goto nextline;
  
! 		if (scanread == 4)
  		{
  			if (strlen(sflag) != 1)
  				goto nextline;
--- 802,816 ----
  		if (*recoded == '\0' || t_isspace(recoded) || t_iseq(recoded, '#'))
  			goto nextline;
  
! 		parseread = parse_ooaffentry(recoded, type, sflag, find, repl, mask);
  
  		if (ptype)
  			pfree(ptype);
  		ptype = lowerstr_ctx(Conf, type);
! 		if (parseread < 4 || (STRNCMP(ptype, "sfx") && STRNCMP(ptype, "pfx")))
  			goto nextline;
  
! 		if (parseread == 4)
  		{
  			if (strlen(sflag) != 1)
  				goto nextline;
***************
*** 722,730 **** NIImportOOAffixes(IspellDict *Conf, const char *filename)
  			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
  				goto nextline;
  			prepl = lowerstr_ctx(Conf, repl);
! 			/* affix flag */
  			if ((ptr = strchr(prepl, '/')) != NULL)
  			{
  				*ptr = '\0';
  				ptr = repl + (ptr - prepl) + 1;
  				while (*ptr)
--- 829,841 ----
  			if (strlen(sflag) != 1 || flag != *sflag || flag == 0)
  				goto nextline;
  			prepl = lowerstr_ctx(Conf, repl);
! 			/* Find position of '/' in lowercased string "prepl" */
  			if ((ptr = strchr(prepl, '/')) != NULL)
  			{
+ 				/*
+ 				 * Here we use non-lowercased string "repl". We need position of
+ 				 * '/' in "repl".
+ 				 */
  				*ptr = '\0';
  				ptr = repl + (ptr - prepl) + 1;
  				while (*ptr)
***************
*** 800,810 **** NIImportAffixes(IspellDict *Conf, const char *filename)
  
  		if (STRNCMP(pstr, "compoundwords") == 0)
  		{
  			s = findchar(pstr, 'l');
  			if (s)
  			{
! 				s = recoded + (s - pstr);		/* we need non-lowercased
! 												 * string */
  				while (*s && !t_isspace(s))
  					s += pg_mblen(s);
  				while (*s && t_isspace(s))
--- 911,922 ----
  
  		if (STRNCMP(pstr, "compoundwords") == 0)
  		{
+ 			/* Find position in lowercased string "pstr" */
  			s = findchar(pstr, 'l');
  			if (s)
  			{
! 				/* Here we use non-lowercased string "recoded" */
! 				s = recoded + (s - pstr);
  				while (*s && !t_isspace(s))
  					s += pg_mblen(s);
  				while (*s && t_isspace(s))

-- 
Sent via pgsql-hackers mailing list (pgsql-hackers@postgresql.org)
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers

Re: [HACKERS] Mac OS: invalid byte sequence for encoding "UTF8"

Reply via email to