Re: [PATCHES] [HACKERS] like/ilike improvements

Andrew Dunstan Thu, 31 May 2007 20:42:11 -0700


Tom Lane wrote:

Andrew Dunstan <[EMAIL PROTECTED]> writes:

  do { (t)++; (tlen)--}  while ((*(t) & 0xC0) == 0x80 && tlen > 0)


The while *must* test those two conditions in the other order.
(Don't laugh --- we've had reproducible bugs before in which the backend
dumped core because of running off the end of memory due to this type
of mistake.)

In fact, I'm wondering if that might make the other UTF8 stuff redundant- the whole point of what we're doing is to avoid expensive calls toNextChar;


+1 I think.  This test will be approximately the same expense as what
the outer loop would otherwise be (tlen > 0 and *t != firstpat), and
doing it this way removes an entire layer of intellectual complexity.
Even though the code is hardly different, we are no longer dealing in
misaligned pointers anywhere in the match algorithm.

OK, here is a patch that I think incorporates all the ideas discussed(including part of Mark Mielke's suggestion about optimising %_). Thereis now no special treatment of UTF8 other than its use of a fasterNextChar macro.


cheers

andrew

Index: src/backend/utils/adt/like.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like.c,v
retrieving revision 1.68
diff -c -r1.68 like.c
*** src/backend/utils/adt/like.c	27 Feb 2007 23:48:08 -0000	1.68
--- src/backend/utils/adt/like.c	1 Jun 2007 02:21:59 -0000
***************
*** 28,48 ****
  #define LIKE_ABORT						(-1)
  
  
! static int	MatchText(char *t, int tlen, char *p, int plen);
! static int	MatchTextIC(char *t, int tlen, char *p, int plen);
! static int	MatchBytea(char *t, int tlen, char *p, int plen);
! static text *do_like_escape(text *, text *);
  
! static int	MBMatchText(char *t, int tlen, char *p, int plen);
! static int	MBMatchTextIC(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);
  
  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
! static int
  wchareq(char *p1, char *p2)
  {
  	int			p1_len;
--- 28,50 ----
  #define LIKE_ABORT						(-1)
  
  
! static int	SB_MatchText(char *t, int tlen, char *p, int plen);
! static text *SB_do_like_escape(text *, text *);
  
! static int	MB_MatchText(char *t, int tlen, char *p, int plen);
  static text *MB_do_like_escape(text *, text *);
  
+ static int	UTF8_MatchText(char *t, int tlen, char *p, int plen);
+ 
+ static int	GenericMatchText(char *s, int slen, char* p, int plen);
+ static int	Generic_Text_IC_like(text *str, text *pat);
+ 
  /*--------------------
   * Support routine for MatchText. Compares given multibyte streams
   * as wide characters. If they match, returns 1 otherwise returns 0.
   *--------------------
   */
! static inline int
  wchareq(char *p1, char *p2)
  {
  	int			p1_len;
***************
*** 72,86 ****
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison.	This should be revisited when we install better locale support.
-  *
-  * Note that MBMatchText and MBMatchTextIC do exactly the same thing now.
-  * Is it worth refactoring to avoid duplicated code?  They might become
-  * different again in the future.
   */
  
  /* Set up to compile like_match.c for multibyte characters */
! #define CHAREQ(p1, p2) wchareq(p1, p2)
! #define ICHAREQ(p1, p2) wchareq(p1, p2)
  #define NextChar(p, plen) \
  	do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
--- 74,85 ----
   * of getting a single character transformed to the system's wchar_t format.
   * So now, we just downcase the strings using lower() and apply regular LIKE
   * comparison.	This should be revisited when we install better locale support.
   */
  
+ #define NextByte(p, plen)	((p)++, (plen)--)
+ 
  /* Set up to compile like_match.c for multibyte characters */
! #define CHAREQ(p1, p2) wchareq((p1), (p2))
  #define NextChar(p, plen) \
  	do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0)
  #define CopyAdvChar(dst, src, srclen) \
***************
*** 90,122 ****
  			 *(dst)++ = *(src)++; \
  	   } while (0)
  
! #define MatchText	MBMatchText
! #define MatchTextIC MBMatchTextIC
  #define do_like_escape	MB_do_like_escape
  
  #include "like_match.c"
  
- #undef CHAREQ
- #undef ICHAREQ
- #undef NextChar
- #undef CopyAdvChar
- #undef MatchText
- #undef MatchTextIC
- #undef do_like_escape
- 
  /* Set up to compile like_match.c for single-byte characters */
  #define CHAREQ(p1, p2) (*(p1) == *(p2))
! #define ICHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2)))
! #define NextChar(p, plen) ((p)++, (plen)--)
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
  #include "like_match.c"
  
- /* And some support for BYTEA */
- #define BYTEA_CHAREQ(p1, p2) (*(p1) == *(p2))
- #define BYTEA_NextChar(p, plen) ((p)++, (plen)--)
- #define BYTEA_CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
  
  /*
   *	interface routines called by the function manager
--- 89,147 ----
  			 *(dst)++ = *(src)++; \
  	   } while (0)
  
! #define MatchText	MB_MatchText
  #define do_like_escape	MB_do_like_escape
  
  #include "like_match.c"
  
  /* Set up to compile like_match.c for single-byte characters */
  #define CHAREQ(p1, p2) (*(p1) == *(p2))
! #define NextChar(p, plen) NextByte((p), (plen))
  #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--)
  
+ #define MatchText	SB_MatchText
+ #define do_like_escape	SB_do_like_escape
+ 
  #include "like_match.c"
  
  
+ /* setup to compile like_match.c for UTF8 encoding, using fast NextChar */
+ 
+ #define NextChar(p, plen) \
+ 	do { (p)++; (plen)--; } while ((plen) > 0 && (*(p) & 0xC0) == 0x80 ) 
+ #define MatchText	UTF8_MatchText
+ 
+ #include "like_match.c"
+ 
+ static inline int
+ GenericMatchText(char *s, int slen, char* p, int plen)
+ {
+ 	if (pg_database_encoding_max_length() == 1)
+ 		return SB_MatchText(s, slen, p, plen);
+ 	else if (GetDatabaseEncoding() == PG_UTF8)
+ 		return UTF8_MatchText(s, slen, p, plen);
+ 	else
+ 		return MB_MatchText(s, slen, p, plen);
+ }
+ 
+ static inline int
+ Generic_Text_IC_like(text *str, text *pat)
+ {
+ 	char	   *s,
+ 			   *p;
+ 	int			slen,
+ 				plen;
+ 
+ 	/* Force inputs to lower case to achieve case insensitivity */
+ 	str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str)));
+ 	pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat)));
+ 	s = VARDATA(str);
+ 	slen = (VARSIZE(str) - VARHDRSZ);
+ 	p = VARDATA(pat);
+ 	plen = (VARSIZE(pat) - VARHDRSZ);
+ 
+ 	return GenericMatchText(s, slen, p, plen);
+ }
  
  /*
   *	interface routines called by the function manager
***************
*** 138,147 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 163,169 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 162,171 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 184,190 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 186,195 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) == LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 205,211 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 210,219 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	if (pg_database_encoding_max_length() == 1)
! 		result = (MatchText(s, slen, p, plen) != LIKE_TRUE);
! 	else
! 		result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 226,232 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 234,240 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (MatchBytea(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 247,253 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 255,261 ****
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (MatchBytea(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
--- 268,274 ----
  	p = VARDATA(pat);
  	plen = (VARSIZE(pat) - VARHDRSZ);
  
! 	result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 270,306 ****
  	Name		str = PG_GETARG_NAME(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
! 	char	   *s,
! 			   *p;
! 	int			slen,
! 				plen;
! 
! 	if (pg_database_encoding_max_length() == 1)
! 	{
! 		s = NameStr(*str);
! 		slen = strlen(s);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
! 	}
! 	else
! 	{
! 		/* Force inputs to lower case to achieve case insensitivity */
! 		text	   *strtext;
  
! 		strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 		strtext = DatumGetTextP(DirectFunctionCall1(lower,
! 												  PointerGetDatum(strtext)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 
! 		s = VARDATA(strtext);
! 		slen = (VARSIZE(strtext) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
! 	}
  
  	PG_RETURN_BOOL(result);
  }
--- 283,293 ----
  	Name		str = PG_GETARG_NAME(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
! 	text	   *strtext;
  
! 	strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 	result = (Generic_Text_IC_like(strtext, pat) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 311,347 ****
  	Name		str = PG_GETARG_NAME(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
! 	char	   *s,
! 			   *p;
! 	int			slen,
! 				plen;
! 
! 	if (pg_database_encoding_max_length() == 1)
! 	{
! 		s = NameStr(*str);
! 		slen = strlen(s);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
! 	}
! 	else
! 	{
! 		/* Force inputs to lower case to achieve case insensitivity */
! 		text	   *strtext;
  
! 		strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 		strtext = DatumGetTextP(DirectFunctionCall1(lower,
! 												  PointerGetDatum(strtext)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 
! 		s = VARDATA(strtext);
! 		slen = (VARSIZE(strtext) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
! 	}
  
  	PG_RETURN_BOOL(result);
  }
--- 298,308 ----
  	Name		str = PG_GETARG_NAME(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
! 	text	   *strtext;
  
! 	strtext = DatumGetTextP(DirectFunctionCall1(name_text,
  													NameGetDatum(str)));
! 	result = (Generic_Text_IC_like(strtext, pat) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 352,383 ****
  	text	   *str = PG_GETARG_TEXT_P(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
- 	char	   *s,
- 			   *p;
- 	int			slen,
- 				plen;
  
! 	if (pg_database_encoding_max_length() == 1)
! 	{
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE);
! 	}
! 	else
! 	{
! 		/* Force inputs to lower case to achieve case insensitivity */
! 		str = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(str)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE);
! 	}
  
  	PG_RETURN_BOOL(result);
  }
--- 313,320 ----
  	text	   *str = PG_GETARG_TEXT_P(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
  
! 	result = (Generic_Text_IC_like(str, pat) == LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 388,419 ****
  	text	   *str = PG_GETARG_TEXT_P(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
- 	char	   *s,
- 			   *p;
- 	int			slen,
- 				plen;
  
! 	if (pg_database_encoding_max_length() == 1)
! 	{
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE);
! 	}
! 	else
! 	{
! 		/* Force inputs to lower case to achieve case insensitivity */
! 		str = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(str)));
! 		pat = DatumGetTextP(DirectFunctionCall1(lower,
! 												PointerGetDatum(pat)));
! 		s = VARDATA(str);
! 		slen = (VARSIZE(str) - VARHDRSZ);
! 		p = VARDATA(pat);
! 		plen = (VARSIZE(pat) - VARHDRSZ);
! 		result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE);
! 	}
  
  	PG_RETURN_BOOL(result);
  }
--- 325,332 ----
  	text	   *str = PG_GETARG_TEXT_P(0);
  	text	   *pat = PG_GETARG_TEXT_P(1);
  	bool		result;
  
! 	result = (Generic_Text_IC_like(str, pat) != LIKE_TRUE);
  
  	PG_RETURN_BOOL(result);
  }
***************
*** 430,436 ****
  	text	   *result;
  
  	if (pg_database_encoding_max_length() == 1)
! 		result = do_like_escape(pat, esc);
  	else
  		result = MB_do_like_escape(pat, esc);
  
--- 343,349 ----
  	text	   *result;
  
  	if (pg_database_encoding_max_length() == 1)
! 		result = SB_do_like_escape(pat, esc);
  	else
  		result = MB_do_like_escape(pat, esc);
  
***************
*** 446,624 ****
  {
  	bytea	   *pat = PG_GETARG_BYTEA_P(0);
  	bytea	   *esc = PG_GETARG_BYTEA_P(1);
! 	bytea	   *result;
! 	char	   *p,
! 			   *e,
! 			   *r;
! 	int			plen,
! 				elen;
! 	bool		afterescape;
! 
! 	p = VARDATA(pat);
! 	plen = (VARSIZE(pat) - VARHDRSZ);
! 	e = VARDATA(esc);
! 	elen = (VARSIZE(esc) - VARHDRSZ);
! 
! 	/*
! 	 * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth
! 	 * trying to calculate the size more accurately than that.
! 	 */
! 	result = (text *) palloc(plen * 2 + VARHDRSZ);
! 	r = VARDATA(result);
! 
! 	if (elen == 0)
! 	{
! 		/*
! 		 * No escape character is wanted.  Double any backslashes in the
! 		 * pattern to make them act like ordinary characters.
! 		 */
! 		while (plen > 0)
! 		{
! 			if (*p == '\\')
! 				*r++ = '\\';
! 			BYTEA_CopyAdvChar(r, p, plen);
! 		}
! 	}
! 	else
! 	{
! 		/*
! 		 * The specified escape must be only a single character.
! 		 */
! 		BYTEA_NextChar(e, elen);
! 		if (elen != 0)
! 			ereport(ERROR,
! 					(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
! 					 errmsg("invalid escape string"),
! 				  errhint("Escape string must be empty or one character.")));
! 
! 		e = VARDATA(esc);
! 
! 		/*
! 		 * If specified escape is '\', just copy the pattern as-is.
! 		 */
! 		if (*e == '\\')
! 		{
! 			memcpy(result, pat, VARSIZE(pat));
! 			PG_RETURN_BYTEA_P(result);
! 		}
! 
! 		/*
! 		 * Otherwise, convert occurrences of the specified escape character to
! 		 * '\', and double occurrences of '\' --- unless they immediately
! 		 * follow an escape character!
! 		 */
! 		afterescape = false;
! 		while (plen > 0)
! 		{
! 			if (BYTEA_CHAREQ(p, e) && !afterescape)
! 			{
! 				*r++ = '\\';
! 				BYTEA_NextChar(p, plen);
! 				afterescape = true;
! 			}
! 			else if (*p == '\\')
! 			{
! 				*r++ = '\\';
! 				if (!afterescape)
! 					*r++ = '\\';
! 				BYTEA_NextChar(p, plen);
! 				afterescape = false;
! 			}
! 			else
! 			{
! 				BYTEA_CopyAdvChar(r, p, plen);
! 				afterescape = false;
! 			}
! 		}
! 	}
! 
! 	SET_VARSIZE(result, r - ((char *) result));
  
! 	PG_RETURN_BYTEA_P(result);
  }
  
- /*
-  * Same as above, but specifically for bytea (binary) datatype
-  */
- static int
- MatchBytea(char *t, int tlen, char *p, int plen)
- {
- 	/* Fast path for match-everything pattern */
- 	if ((plen == 1) && (*p == '%'))
- 		return LIKE_TRUE;
- 
- 	while ((tlen > 0) && (plen > 0))
- 	{
- 		if (*p == '\\')
- 		{
- 			/* Next pattern char must match literally, whatever it is */
- 			BYTEA_NextChar(p, plen);
- 			if ((plen <= 0) || !BYTEA_CHAREQ(t, p))
- 				return LIKE_FALSE;
- 		}
- 		else if (*p == '%')
- 		{
- 			/* %% is the same as % according to the SQL standard */
- 			/* Advance past all %'s */
- 			while ((plen > 0) && (*p == '%'))
- 				BYTEA_NextChar(p, plen);
- 			/* Trailing percent matches everything. */
- 			if (plen <= 0)
- 				return LIKE_TRUE;
- 
- 			/*
- 			 * Otherwise, scan for a text position at which we can match the
- 			 * rest of the pattern.
- 			 */
- 			while (tlen > 0)
- 			{
- 				/*
- 				 * Optimization to prevent most recursion: don't recurse
- 				 * unless first pattern char might match this text char.
- 				 */
- 				if (BYTEA_CHAREQ(t, p) || (*p == '\\') || (*p == '_'))
- 				{
- 					int			matched = MatchBytea(t, tlen, p, plen);
- 
- 					if (matched != LIKE_FALSE)
- 						return matched; /* TRUE or ABORT */
- 				}
- 
- 				BYTEA_NextChar(t, tlen);
- 			}
- 
- 			/*
- 			 * End of text with no match, so no point in trying later places
- 			 * to start matching this pattern.
- 			 */
- 			return LIKE_ABORT;
- 		}
- 		else if ((*p != '_') && !BYTEA_CHAREQ(t, p))
- 		{
- 			/*
- 			 * Not the single-character wildcard and no explicit match? Then
- 			 * time to quit...
- 			 */
- 			return LIKE_FALSE;
- 		}
- 
- 		BYTEA_NextChar(t, tlen);
- 		BYTEA_NextChar(p, plen);
- 	}
- 
- 	if (tlen > 0)
- 		return LIKE_FALSE;		/* end of pattern, but not of text */
- 
- 	/* End of input string.  Do we have matching pattern remaining? */
- 	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
- 										 * pattern */
- 		BYTEA_NextChar(p, plen);
- 	if (plen <= 0)
- 		return LIKE_TRUE;
- 
- 	/*
- 	 * End of text with no match, so no point in trying later places to start
- 	 * matching this pattern.
- 	 */
- 	return LIKE_ABORT;
- }	/* MatchBytea() */
--- 359,366 ----
  {
  	bytea	   *pat = PG_GETARG_BYTEA_P(0);
  	bytea	   *esc = PG_GETARG_BYTEA_P(1);
! 	bytea	   *result = SB_do_like_escape((text *)pat, (text *)esc);
  
! 	PG_RETURN_BYTEA_P((bytea *)result);
  }
  
Index: src/backend/utils/adt/like_match.c
===================================================================
RCS file: /cvsroot/pgsql/src/backend/utils/adt/like_match.c,v
retrieving revision 1.15
diff -c -r1.15 like_match.c
*** src/backend/utils/adt/like_match.c	27 Feb 2007 23:48:08 -0000	1.15
--- src/backend/utils/adt/like_match.c	1 Jun 2007 02:21:59 -0000
***************
*** 3,20 ****
   * like_match.c
   *	  like expression handling internal code.
   *
!  * This file is included by like.c *twice*, to provide an optimization
!  * for single-byte encodings.
   *
   * Before the inclusion, we need to define following macros:
   *
!  * CHAREQ
!  * ICHAREQ
!  * NextChar
!  * CopyAdvChar
!  * MatchText (MBMatchText)
!  * MatchTextIC (MBMatchTextIC)
!  * do_like_escape (MB_do_like_escape)
   *
   * Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
--- 3,18 ----
   * like_match.c
   *	  like expression handling internal code.
   *
!  * This file is included by like.c three times, to provide natching code for
!  * single-byte encodings, UTF8, and for other multi-byte encodings.
!  * UTF8 is a special case because we can use a much more efficient version
!  * of NextChar than can be used for other multi-byte encodings.
   *
   * Before the inclusion, we need to define following macros:
   *
!  * NextChar 
!  * MatchText - to name of function wanted
!  * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar
   *
   * Copyright (c) 1996-2007, PostgreSQL Global Development Group
   *
***************
*** 81,97 ****
  	{
  		if (*p == '\\')
  		{
! 			/* Next pattern char must match literally, whatever it is */
! 			NextChar(p, plen);
! 			if ((plen <= 0) || !CHAREQ(t, p))
  				return LIKE_FALSE;
  		}
  		else if (*p == '%')
  		{
  			/* %% is the same as % according to the SQL standard */
  			/* Advance past all %'s */
  			while ((plen > 0) && (*p == '%'))
! 				NextChar(p, plen);
  			/* Trailing percent matches everything. */
  			if (plen <= 0)
  				return LIKE_TRUE;
--- 79,101 ----
  	{
  		if (*p == '\\')
  		{
! 			/* Next byte must match literally, whatever it is */
! 			NextByte(p, plen);
! 			if ((plen <= 0) || *p != *t )
  				return LIKE_FALSE;
  		}
  		else if (*p == '%')
  		{
+ 			/*
+ 			 * % processing is essentially a search for a match for what 
+ 			 * follows the %, plus a recursive match of the remainder.
+ 			 * We succeed if and only if both conditions are met.
+ 			 */
+ 
  			/* %% is the same as % according to the SQL standard */
  			/* Advance past all %'s */
  			while ((plen > 0) && (*p == '%'))
! 				NextByte(p, plen);
  			/* Trailing percent matches everything. */
  			if (plen <= 0)
  				return LIKE_TRUE;
***************
*** 100,206 ****
  			 * Otherwise, scan for a text position at which we can match the
  			 * rest of the pattern.
  			 */
! 			while (tlen > 0)
! 			{
! 				/*
! 				 * Optimization to prevent most recursion: don't recurse
! 				 * unless first pattern char might match this text char.
! 				 */
! 				if (CHAREQ(t, p) || (*p == '\\') || (*p == '_'))
! 				{
! 					int			matched = MatchText(t, tlen, p, plen);
  
! 					if (matched != LIKE_FALSE)
! 						return matched; /* TRUE or ABORT */
! 				}
  
  				NextChar(t, tlen);
! 			}
  
! 			/*
! 			 * End of text with no match, so no point in trying later places
! 			 * to start matching this pattern.
! 			 */
! 			return LIKE_ABORT;
! 		}
! 		else if ((*p != '_') && !CHAREQ(t, p))
! 		{
! 			/*
! 			 * Not the single-character wildcard and no explicit match? Then
! 			 * time to quit...
! 			 */
! 			return LIKE_FALSE;
! 		}
! 
! 		NextChar(t, tlen);
! 		NextChar(p, plen);
! 	}
! 
! 	if (tlen > 0)
! 		return LIKE_FALSE;		/* end of pattern, but not of text */
  
! 	/* End of input string.  Do we have matching pattern remaining? */
! 	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
! 										 * pattern */
! 		NextChar(p, plen);
! 	if (plen <= 0)
! 		return LIKE_TRUE;
  
! 	/*
! 	 * End of text with no match, so no point in trying later places to start
! 	 * matching this pattern.
! 	 */
! 	return LIKE_ABORT;
! }	/* MatchText() */
  
! /*
!  * Same as above, but ignore case
!  */
! static int
! MatchTextIC(char *t, int tlen, char *p, int plen)
! {
! 	/* Fast path for match-everything pattern */
! 	if ((plen == 1) && (*p == '%'))
! 		return LIKE_TRUE;
  
! 	while ((tlen > 0) && (plen > 0))
! 	{
! 		if (*p == '\\')
! 		{
! 			/* Next pattern char must match literally, whatever it is */
! 			NextChar(p, plen);
! 			if ((plen <= 0) || !ICHAREQ(t, p))
! 				return LIKE_FALSE;
! 		}
! 		else if (*p == '%')
! 		{
! 			/* %% is the same as % according to the SQL standard */
! 			/* Advance past all %'s */
! 			while ((plen > 0) && (*p == '%'))
! 				NextChar(p, plen);
! 			/* Trailing percent matches everything. */
! 			if (plen <= 0)
! 				return LIKE_TRUE;
  
! 			/*
! 			 * Otherwise, scan for a text position at which we can match the
! 			 * rest of the pattern.
! 			 */
! 			while (tlen > 0)
! 			{
! 				/*
! 				 * Optimization to prevent most recursion: don't recurse
! 				 * unless first pattern char might match this text char.
! 				 */
! 				if (ICHAREQ(t, p) || (*p == '\\') || (*p == '_'))
  				{
! 					int			matched = MatchTextIC(t, tlen, p, plen);
  
! 					if (matched != LIKE_FALSE)
! 						return matched; /* TRUE or ABORT */
! 				}
  
! 				NextChar(t, tlen);
  			}
  
  			/*
--- 104,165 ----
  			 * Otherwise, scan for a text position at which we can match the
  			 * rest of the pattern.
  			 */
! 			if (*p == '_')
  
! 			{
! 				/* %_ is the same as _% - avoid matching _ repeatedly */
  
  				NextChar(t, tlen);
! 				NextByte(p, plen);
  
! 				if (tlen == 0)
! 				{
! 					return (plen == 0) ? LIKE_TRUE : LIKE_ABORT;
! 				}
! 				else if (plen == 0)
! 				{
! 					return LIKE_FALSE;
! 				}
  
! 				while (tlen > 0)
! 				{
! 					int			matched = MatchText(t, tlen, p, plen);
! 						
! 					if (matched != LIKE_FALSE)
! 							return matched; /* TRUE or ABORT */
  
! 					NextChar(t, tlen);
! 				}
! 			}
! 			else
! 			{
  
! 				char firstpat = *p ;
  
! 				if (*p == '\\')
! 				{
! 					if (plen < 2)
! 						return LIKE_FALSE;
! 					firstpat = p[1];
! 				}
  
! 				while (tlen > 0)
  				{
! 					/*
! 					 * Optimization to prevent most recursion: don't recurse
! 					 * unless first pattern byte matches first text byte.
! 					 */
! 					if (*t == firstpat)
! 					{
! 						int			matched = MatchText(t, tlen, p, plen);
! 						
! 						if (matched != LIKE_FALSE)
! 							return matched; /* TRUE or ABORT */
! 					}
  
! 					NextChar(t, tlen);
  
! 				}
  			}
  
  			/*
***************
*** 209,215 ****
  			 */
  			return LIKE_ABORT;
  		}
! 		else if ((*p != '_') && !ICHAREQ(t, p))
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
--- 168,180 ----
  			 */
  			return LIKE_ABORT;
  		}
! 		else if (*p == '_')
! 		{
! 			NextChar(t, tlen);
! 			NextByte(p, plen);
! 			continue;
! 		}
! 		else if (*t != *p)
  		{
  			/*
  			 * Not the single-character wildcard and no explicit match? Then
***************
*** 217,225 ****
  			 */
  			return LIKE_FALSE;
  		}
! 
! 		NextChar(t, tlen);
! 		NextChar(p, plen);
  	}
  
  	if (tlen > 0)
--- 182,194 ----
  			 */
  			return LIKE_FALSE;
  		}
! 		/*
! 		 * It is safe to use NextByte instead of NextChar here, even for
! 		 * multi-byte character sets, because we are not following 
! 		 * immediately after a wildcard character.
! 		 */
! 		NextByte(t, tlen);
! 		NextByte(p, plen);
  	}
  
  	if (tlen > 0)
***************
*** 228,234 ****
  	/* End of input string.  Do we have matching pattern remaining? */
  	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
  										 * pattern */
! 		NextChar(p, plen);
  	if (plen <= 0)
  		return LIKE_TRUE;
  
--- 197,204 ----
  	/* End of input string.  Do we have matching pattern remaining? */
  	while ((plen > 0) && (*p == '%'))	/* allow multiple %'s at end of
  										 * pattern */
! 		NextByte(p, plen);
! 
  	if (plen <= 0)
  		return LIKE_TRUE;
  
***************
*** 237,248 ****
  	 * matching this pattern.
  	 */
  	return LIKE_ABORT;
! }	/* MatchTextIC() */
  
  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
  static text *
  do_like_escape(text *pat, text *esc)
  {
--- 207,220 ----
  	 * matching this pattern.
  	 */
  	return LIKE_ABORT;
! }	/* MatchText() */
  
  /*
   * like_escape() --- given a pattern and an ESCAPE string,
   * convert the pattern to use Postgres' standard backslash escape convention.
   */
+ #ifdef do_like_escape
+ 
  static text *
  do_like_escape(text *pat, text *esc)
  {
***************
*** 336,338 ****
--- 308,324 ----
  
  	return result;
  }
+ #endif /* do_like_escape */
+ 
+ #ifdef CHAREQ
+ #undef CHAREQ
+ #endif
+ 
+ #undef NextChar
+ #undef CopyAdvChar
+ #undef MatchText
+ 
+ #ifdef do_like_escape
+ #undef do_like_escape
+ #endif
+

---------------------------(end of broadcast)---------------------------
TIP 9: In versions below 8.0, the planner will ignore your desire to
       choose an index scan if your joining column's datatypes do not
       match

Re: [PATCHES] [HACKERS] like/ilike improvements

Reply via email to