Re: Allow to_date() and to_timestamp() to accept localized names

Juan José Santamaría Flecha Sun, 22 Sep 2019 05:34:39 -0700

On Wed, Sep 18, 2019 at 11:09 AM Juan José Santamaría Flecha
<juanjo.santama...@gmail.com> wrote:
>
> On Fri, Sep 13, 2019 at 10:31 PM Alvaro Herrera
> <alvhe...@2ndquadrant.com> wrote:
> >
> Thanks for taking a look at this.
>
> > I'm confused why we acquire the MONTH_DIM / etc definitions.  Can't we
> > just use lengthof() of the corresponding array?  AFAICS it should work
> > just as well.
> >
>
> It was because of the length difference between ascii-name arrays,
> which were all null-ended, and localized-name arrays. The attached
> version uses lengthof().
>
> > I wonder if the "compare first char" thing (seq_search_localized) really
> > works when there are multibyte chars in the day/month names.  I think
> > the code compares just the first char ... but what if the original
> > string uses those funny Unicode non-normalized letters and the locale
> > translation uses normalized letters?  My guess is that the first-char
> > comparison will fail, but surely you'll want the name to match.
> > (There's no month/day name in Spanish that doesn't start with an ASCII
> > letter, but I bet there are some in other languages.)  I think the
> > localized comparison should avoid the first-char optimization, just
> > compare the whole string all the time, and avoid possible weird issues.
>
> The localized search is reformulated in this version to deal with
> multibyte normalization. A regression test for this issue is included.


This version is updated to optimize the need for dynamic allocation.


> Regards,
>
> Juan José Santamaría Flecha

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 4e3e213..c470c0e 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -6415,8 +6415,17 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
      <listitem>
       <para>
        <literal>TM</literal> does not include trailing blanks.
+      </para>
+     </listitem>
+
+     <listitem>
+      <para>
        <function>to_timestamp</function> and <function>to_date</function> ignore
-       the <literal>TM</literal> modifier.
+       the case when receiving names as an input.  For example, either
+       <literal>to_timestamp('2000-JUN', 'YYYY-MON')</literal> or
+       <literal>to_timestamp('2000-Jun', 'YYYY-MON')</literal> or
+       <literal>to_timestamp('2000-JUN', 'YYYY-mon')</literal> work and return
+       the same output.
       </para>
      </listitem>
 
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index 755ca6e..39d2d11 100644
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -96,6 +96,7 @@
 #include "utils/memutils.h"
 #include "utils/numeric.h"
 #include "utils/pg_locale.h"
+#include "common/unicode_norm.h"
 
 /* ----------
  * Routines type
@@ -174,18 +175,17 @@ typedef struct
 #define CLOCK_24_HOUR		0
 #define CLOCK_12_HOUR		1
 
-
 /* ----------
  * Full months
  * ----------
  */
 static const char *const months_full[] = {
 	"January", "February", "March", "April", "May", "June", "July",
-	"August", "September", "October", "November", "December", NULL
+	"August", "September", "October", "November", "December"
 };
 
 static const char *const days_short[] = {
-	"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", NULL
+	"Sun", "Mon", "Tue", "Wed", "Thu", "Fri", "Sat"
 };
 
 /* ----------
@@ -217,8 +217,8 @@ static const char *const days_short[] = {
  * matches for BC have an odd index.  So the boolean value for BC is given by
  * taking the array index of the match, modulo 2.
  */
-static const char *const adbc_strings[] = {ad_STR, bc_STR, AD_STR, BC_STR, NULL};
-static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_STR, NULL};
+static const char *const adbc_strings[] = {ad_STR, bc_STR, AD_STR, BC_STR};
+static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_STR};
 
 /* ----------
  * AM / PM
@@ -244,8 +244,8 @@ static const char *const adbc_strings_long[] = {a_d_STR, b_c_STR, A_D_STR, B_C_S
  * matches for PM have an odd index.  So the boolean value for PM is given by
  * taking the array index of the match, modulo 2.
  */
-static const char *const ampm_strings[] = {am_STR, pm_STR, AM_STR, PM_STR, NULL};
-static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_STR, NULL};
+static const char *const ampm_strings[] = {am_STR, pm_STR, AM_STR, PM_STR};
+static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_STR};
 
 /* ----------
  * Months in roman-numeral
@@ -254,10 +254,10 @@ static const char *const ampm_strings_long[] = {a_m_STR, p_m_STR, A_M_STR, P_M_S
  * ----------
  */
 static const char *const rm_months_upper[] =
-{"XII", "XI", "X", "IX", "VIII", "VII", "VI", "V", "IV", "III", "II", "I", NULL};
+{"XII", "XI", "X", "IX", "VIII", "VII", "VI", "V", "IV", "III", "II", "I"};
 
 static const char *const rm_months_lower[] =
-{"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL};
+{"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i"};
 
 /* ----------
  * Roman numbers
@@ -975,7 +975,7 @@ static void parse_format(FormatNode *node, const char *str, const KeyWord *kw,
 
 static void DCH_to_char(FormatNode *node, bool is_interval,
 						TmToChar *in, char *out, Oid collid);
-static void DCH_from_char(FormatNode *node, char *in, TmFromChar *out);
+static void DCH_from_char(FormatNode *node, char *in, TmFromChar *out, Oid collid);
 
 #ifdef DEBUG_TO_FROM_CHAR
 static void dump_index(const KeyWord *k, const int *index);
@@ -990,10 +990,12 @@ static void from_char_set_mode(TmFromChar *tmfc, const FromCharDateMode mode);
 static void from_char_set_int(int *dest, const int value, const FormatNode *node);
 static int	from_char_parse_int_len(int *dest, char **src, const int len, FormatNode *node);
 static int	from_char_parse_int(int *dest, char **src, FormatNode *node);
-static int	seq_search(char *name, const char *const *array, int type, int max, int *len);
-static int	from_char_seq_search(int *dest, char **src, const char *const *array, int type, int max, FormatNode *node);
+static int	seq_search_sqlascii(char *name, const char *const *array, int type, int max, int *len, int dim);
+static int	seq_search_localized(char *name, char **array, int max, int *len, int dim, Oid collid);
+static int	from_char_seq_search(int *dest, char **src, const char *const *array, char **localized_array, int type, int max,
+								 FormatNode *node, int dim, Oid collid);
 static void do_to_timestamp(text *date_txt, text *fmt,
-							struct pg_tm *tm, fsec_t *fsec);
+							struct pg_tm *tm, fsec_t *fsec, Oid collid);
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
@@ -2325,17 +2327,18 @@ from_char_parse_int(int *dest, char **src, FormatNode *node)
 }
 
 /* ----------
- * Sequential search with to upper/lower conversion
+ * Sequential search with to upper/lower conversion for SQL_ASCII array input
  * ----------
  */
 static int
-seq_search(char *name, const char *const *array, int type, int max, int *len)
+seq_search_sqlascii(char *name, const char *const *array, int type, int max, int *len, int dim)
 {
 	const char *p;
 	const char *const *a;
 	char	   *n;
 	int			last,
 				i;
+	int			index;
 
 	*len = 0;
 
@@ -2348,7 +2351,7 @@ seq_search(char *name, const char *const *array, int type, int max, int *len)
 	else if (type == ALL_LOWER)
 		*name = pg_tolower((unsigned char) *name);
 
-	for (last = 0, a = array; *a != NULL; a++)
+	for (last = 0, a = array, index = 0; index < dim; a++, index++)
 	{
 		/* compare first chars */
 		if (*name != **a)
@@ -2360,13 +2363,13 @@ seq_search(char *name, const char *const *array, int type, int max, int *len)
 			if (max && i == max)
 			{
 				*len = i;
-				return a - array;
+				return index;
 			}
 			/* full size */
 			if (*p == '\0')
 			{
 				*len = i;
-				return a - array;
+				return index;
 			}
 			/* Not found in array 'a' */
 			if (*n == '\0')
@@ -2396,6 +2399,83 @@ seq_search(char *name, const char *const *array, int type, int max, int *len)
 	return -1;
 }
 
+/* ----------
+ * Sequential search with initcap conversion for localized array input
+ * ----------
+ */
+static int
+seq_search_localized(char *name, char **array, int max, int *len, int dim, Oid collid)
+{
+	char 	  **a;
+	char 	   *initcap_element;
+	char 	   *initcap_name;
+	char 	   *norm_name;
+	int			index;
+	int 		mb_max;
+	int         name_len;
+	int			encoding;
+	int         norm_len;
+	int         element_len;
+
+	*len = 0;
+
+	if (!*name)
+		return -1;
+
+	encoding = GetDatabaseEncoding();
+	mb_max = max * pg_encoding_max_length(encoding);
+	name_len = strlen(name);
+	name_len = name_len < mb_max ? name_len : mb_max;
+
+	/* Normalize and initcap name */
+	norm_name = name;
+	norm_len = name_len;
+	if (mb_max > max && encoding == PG_UTF8)
+	{
+		pg_wchar   *wchar_name;
+		pg_wchar   *norm_wname;
+		size_t      name_wlen;
+		size_t      norm_wlen;
+		wchar_name = (pg_wchar *) palloc((name_len + 1) * sizeof(pg_wchar));
+		name_wlen = pg_mb2wchar_with_len(name, wchar_name, name_len);
+		norm_wname = unicode_normalize_kc(wchar_name);
+		pfree(wchar_name);
+		norm_wlen = pg_wchar_strlen(norm_wname);
+		if (name_wlen > norm_wlen)
+		{
+			norm_name = (char *) palloc((norm_wlen + 1) * sizeof(pg_wchar));
+			norm_len = pg_wchar2mb_with_len(norm_wname, norm_name, norm_wlen);
+		}
+		pfree(norm_wname);
+	}
+	initcap_name = str_initcap(norm_name, norm_len, collid);
+	if (name_len != norm_len)
+		pfree(norm_name);
+
+	for (a = array, index = 0; index < dim; a++, index++)
+	{
+		/* Initcap element, assume it is normalized */
+		element_len = strlen(*a);
+		initcap_element = str_initcap(*a, element_len, collid);
+
+#ifdef DEBUG_TO_FROM_CHAR
+        elog(DEBUG_elog_output, "Name: 0x%x, Normalized: 0x%x, Element: 0x%x",
+             (unsigned char)*name, (unsigned char)*initcap_name, (unsigned char)*initcap_element);
+#endif
+		if (strncmp(initcap_name, initcap_element, element_len) == 0)
+		{
+			*len = element_len + name_len - norm_len;
+			pfree(initcap_element);
+			pfree(initcap_name);
+			return index;
+		}
+		pfree(initcap_element);
+	}
+
+	pfree(initcap_name);
+	return -1;
+}
+
 /*
  * Perform a sequential search in 'array' for text matching the first 'max'
  * characters of the source string.
@@ -2407,16 +2487,20 @@ seq_search(char *name, const char *const *array, int type, int max, int *len)
  * If the string doesn't match, throw an error.
  */
 static int
-from_char_seq_search(int *dest, char **src, const char *const *array, int type, int max,
-					 FormatNode *node)
+from_char_seq_search(int *dest, char **src, const char *const *array, char **localized_array, int type, int max,
+					 FormatNode *node, int dim, Oid collid)
 {
 	int			len;
 
-	*dest = seq_search(*src, array, type, max, &len);
+	if (localized_array == NULL)
+		*dest = seq_search_sqlascii(*src, array, type, max, &len, dim);
+	else
+		*dest = seq_search_localized(*src, localized_array, max, &len, dim, collid);
 	if (len <= 0)
 	{
 		char		copy[DCH_MAX_ITEM_SIZ + 1];
 
+		/* We use byte length, localized names encoding is ignored */
 		Assert(max <= DCH_MAX_ITEM_SIZ);
 		strlcpy(copy, *src, max + 1);
 
@@ -3017,19 +3101,24 @@ DCH_to_char(FormatNode *node, bool is_interval, TmToChar *in, char *out, Oid col
  * ----------
  */
 static void
-DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
+DCH_from_char(FormatNode *node, char *in, TmFromChar *out, Oid collid)
 {
 	FormatNode *n;
 	char	   *s;
 	int			len,
 				value;
 	bool		fx_mode = false;
+	char	  **localized_names;
 
 	/* number of extra skipped characters (more than given in format string) */
 	int			extra_skip = 0;
+	/* cache localized days and months */
+	cache_locale_time();
 
 	for (n = node, s = in; n->type != NODE_TYPE_END && *s != '\0'; n++)
 	{
+		localized_names = NULL;
+
 		/*
 		 * Ignore spaces at the beginning of the string and before fields when
 		 * not in FX (fixed width) mode.
@@ -3111,8 +3200,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 			case DCH_P_M:
 			case DCH_a_m:
 			case DCH_p_m:
-				from_char_seq_search(&value, &s, ampm_strings_long,
-									 ALL_UPPER, n->key->len, n);
+				from_char_seq_search(&value, &s, ampm_strings_long, localized_names,
+									 ALL_UPPER, n->key->len, n, lengthof(ampm_strings_long), collid);
 				from_char_set_int(&out->pm, value % 2, n);
 				out->clock = CLOCK_12_HOUR;
 				break;
@@ -3120,8 +3209,8 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 			case DCH_PM:
 			case DCH_am:
 			case DCH_pm:
-				from_char_seq_search(&value, &s, ampm_strings,
-									 ALL_UPPER, n->key->len, n);
+				from_char_seq_search(&value, &s, ampm_strings, localized_names,
+									 ALL_UPPER, n->key->len, n, lengthof(ampm_strings), collid);
 				from_char_set_int(&out->pm, value % 2, n);
 				out->clock = CLOCK_12_HOUR;
 				break;
@@ -3211,30 +3300,34 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 			case DCH_B_C:
 			case DCH_a_d:
 			case DCH_b_c:
-				from_char_seq_search(&value, &s, adbc_strings_long,
-									 ALL_UPPER, n->key->len, n);
+				from_char_seq_search(&value, &s, adbc_strings_long, localized_names,
+									 ALL_UPPER, n->key->len, n, lengthof(adbc_strings_long), collid);
 				from_char_set_int(&out->bc, value % 2, n);
 				break;
 			case DCH_AD:
 			case DCH_BC:
 			case DCH_ad:
 			case DCH_bc:
-				from_char_seq_search(&value, &s, adbc_strings,
-									 ALL_UPPER, n->key->len, n);
+				from_char_seq_search(&value, &s, adbc_strings, localized_names,
+									 ALL_UPPER, n->key->len, n, lengthof(adbc_strings), collid);
 				from_char_set_int(&out->bc, value % 2, n);
 				break;
 			case DCH_MONTH:
 			case DCH_Month:
 			case DCH_month:
-				from_char_seq_search(&value, &s, months_full, ONE_UPPER,
-									 MAX_MONTH_LEN, n);
+				if (S_TM(n->suffix))
+					localized_names = localized_full_months;
+				from_char_seq_search(&value, &s, months_full, localized_names,
+									 ONE_UPPER, MAX_MONTH_LEN, n, lengthof(months_full), collid);
 				from_char_set_int(&out->mm, value + 1, n);
 				break;
 			case DCH_MON:
 			case DCH_Mon:
 			case DCH_mon:
-				from_char_seq_search(&value, &s, months, ONE_UPPER,
-									 MAX_MON_LEN, n);
+				if (S_TM(n->suffix))
+					localized_names = localized_abbrev_months;
+				from_char_seq_search(&value, &s, months, localized_names,
+									 ONE_UPPER, MAX_MON_LEN, n, lengthof(months_full), collid);
 				from_char_set_int(&out->mm, value + 1, n);
 				break;
 			case DCH_MM:
@@ -3244,16 +3337,20 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 			case DCH_DAY:
 			case DCH_Day:
 			case DCH_day:
-				from_char_seq_search(&value, &s, days, ONE_UPPER,
-									 MAX_DAY_LEN, n);
+				if (S_TM(n->suffix))
+					localized_names = localized_full_days;
+				from_char_seq_search(&value, &s, days, localized_names,
+									 ONE_UPPER, MAX_DAY_LEN, n, lengthof(days_short), collid);
 				from_char_set_int(&out->d, value, n);
 				out->d++;
 				break;
 			case DCH_DY:
 			case DCH_Dy:
 			case DCH_dy:
-				from_char_seq_search(&value, &s, days, ONE_UPPER,
-									 MAX_DY_LEN, n);
+				if (S_TM(n->suffix))
+					localized_names = localized_abbrev_days;
+				from_char_seq_search(&value, &s, days_short, localized_names,
+									 ONE_UPPER, MAX_DY_LEN, n, lengthof(days_short), collid);
 				from_char_set_int(&out->d, value, n);
 				out->d++;
 				break;
@@ -3351,13 +3448,13 @@ DCH_from_char(FormatNode *node, char *in, TmFromChar *out)
 				SKIP_THth(s, n->suffix);
 				break;
 			case DCH_RM:
-				from_char_seq_search(&value, &s, rm_months_upper,
-									 ALL_UPPER, MAX_RM_LEN, n);
+				from_char_seq_search(&value, &s, rm_months_upper, localized_names,
+									 ALL_UPPER, MAX_RM_LEN, n, lengthof(rm_months_upper), collid);
 				from_char_set_int(&out->mm, MONTHS_PER_YEAR - value, n);
 				break;
 			case DCH_rm:
-				from_char_seq_search(&value, &s, rm_months_lower,
-									 ALL_LOWER, MAX_RM_LEN, n);
+				from_char_seq_search(&value, &s, rm_months_lower, localized_names,
+									 ALL_LOWER, MAX_RM_LEN, n, lengthof(rm_months_lower), collid);
 				from_char_set_int(&out->mm, MONTHS_PER_YEAR - value, n);
 				break;
 			case DCH_W:
@@ -3690,7 +3787,7 @@ to_timestamp(PG_FUNCTION_ARGS)
 	struct pg_tm tm;
 	fsec_t		fsec;
 
-	do_to_timestamp(date_txt, fmt, &tm, &fsec);
+	do_to_timestamp(date_txt, fmt, &tm, &fsec, PG_GET_COLLATION());
 
 	/* Use the specified time zone, if any. */
 	if (tm.tm_zone)
@@ -3725,7 +3822,7 @@ to_date(PG_FUNCTION_ARGS)
 	struct pg_tm tm;
 	fsec_t		fsec;
 
-	do_to_timestamp(date_txt, fmt, &tm, &fsec);
+	do_to_timestamp(date_txt, fmt, &tm, &fsec, PG_GET_COLLATION());
 
 	/* Prevent overflow in Julian-day routines */
 	if (!IS_VALID_JULIAN(tm.tm_year, tm.tm_mon, tm.tm_mday))
@@ -3761,7 +3858,7 @@ to_date(PG_FUNCTION_ARGS)
  */
 static void
 do_to_timestamp(text *date_txt, text *fmt,
-				struct pg_tm *tm, fsec_t *fsec)
+				struct pg_tm *tm, fsec_t *fsec, Oid collid)
 {
 	FormatNode *format;
 	TmFromChar	tmfc;
@@ -3810,11 +3907,11 @@ do_to_timestamp(text *date_txt, text *fmt,
 		}
 
 #ifdef DEBUG_TO_FROM_CHAR
-		/* dump_node(format, fmt_len); */
-		/* dump_index(DCH_keywords, DCH_index); */
+		dump_node(format, fmt_len);
+		dump_index(DCH_keywords, DCH_index);
 #endif
 
-		DCH_from_char(format, date_str, &tmfc);
+		DCH_from_char(format, date_str, &tmfc, collid);
 
 		pfree(fmt_str);
 		if (!incache)
diff --git a/src/test/regress/expected/collate.linux.utf8.out b/src/test/regress/expected/collate.linux.utf8.out
index ad56ff9..5319d95 100644
--- a/src/test/regress/expected/collate.linux.utf8.out
+++ b/src/test/regress/expected/collate.linux.utf8.out
@@ -461,6 +461,24 @@ SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
  01 NİS 2010
 (1 row)
 
+-- to_date
+SET DateStyle='ISO, YMD';
+SET TimeZone='UTC';
+SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E
+      to_timestamp      
+------------------------
+ 2010-02-01 00:00:00+00
+(1 row)
+
+SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327
+      to_timestamp      
+------------------------
+ 2010-02-01 00:00:00+00
+(1 row)
+
+SELECT to_timestamp('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+ERROR:  invalid value "123456789" for "MONTH"
+DETAIL:  The given value did not match any of the allowed values for this field.
 -- backwards parsing
 CREATE VIEW collview1 AS SELECT * FROM collate_test1 WHERE b COLLATE "C" >= 'bbc';
 CREATE VIEW collview2 AS SELECT a, b FROM collate_test1 ORDER BY b COLLATE "C";
diff --git a/src/test/regress/sql/collate.linux.utf8.sql b/src/test/regress/sql/collate.linux.utf8.sql
index eac2f90..08b0404 100644
--- a/src/test/regress/sql/collate.linux.utf8.sql
+++ b/src/test/regress/sql/collate.linux.utf8.sql
@@ -182,6 +182,14 @@ SELECT to_char(date '2010-02-01', 'DD TMMON YYYY' COLLATE "tr_TR");
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY');
 SELECT to_char(date '2010-04-01', 'DD TMMON YYYY' COLLATE "tr_TR");
 
+-- to_date
+
+SET DateStyle='ISO, YMD';
+SET TimeZone='UTC';
+SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- normalized \u015E
+SELECT to_timestamp('01 ŞUB 2010', 'DD TMMON YYYY'); -- not normalized \u0053+\u0327
+SELECT to_timestamp('1234567890ab 2010', 'TMMONTH YYYY'); -- fail
+
 
 -- backwards parsing

Re: Allow to_date() and to_timestamp() to accept localized names

Reply via email to