I wrote:
> That leads me to the attached patch. There is more that could be done
> here --- in particular, I'd like to see the character-not-byte-count
> rule extended to literal text. But that seems like fit material for
> a different patch.
Attached is a patch that makes formatting.c more multibyte-aware;
it now handles multibyte characters as single NODE_TYPE_CHAR format
nodes, rather than one node per byte. This doesn't really have much
impact on the output (to_char) side, but on the input side, it
greatly simplifies treating such characters as single characters
rather than multiple ones. An example is that (in UTF8 encoding)
previously we got
u8=# select to_number('$12.34', '€99D99');
 to_number
-----------
      0.34
(1 row)
because the literal euro sign is 3 bytes long and was thought to be
3 literal characters. Now we get the expected result
u8=# select to_number('$12.34', '€99D99');
 to_number
-----------
     12.34
(1 row)
Aside from skipping one input character (not byte) per literal format
character, I fixed the SKIP_THth macro, so that to_date/to_timestamp
also follow the rule of skipping whole characters, not bytes, for non-data
format patterns. There might be some other places that need similar
adjustments, but I couldn't find any.
Not sure about whether/how to add regression tests for this; it's really
impossible to add specific tests in an ASCII-only test file. Maybe we
could put a test or two into collate.linux.utf8.sql, but it seems a bit
off topic for that, and I think that test file hardly gets run anyway.
Note this needs to be applied over the patch I posted at
https://postgr.es/m/[email protected]
I intend to commit that fairly soon, but it has not been committed yet.
regards, tom lane
diff --git a/src/backend/utils/adt/formatting.c b/src/backend/utils/adt/formatting.c
index cb0dbf7..ec97de0 100644
*** a/src/backend/utils/adt/formatting.c
--- b/src/backend/utils/adt/formatting.c
*************** typedef enum
*** 151,158 ****
FROM_CHAR_DATE_ISOWEEK /* ISO 8601 week date */
} FromCharDateMode;
- typedef struct FormatNode FormatNode;
-
typedef struct
{
const char *name;
--- 151,156 ----
*************** typedef struct
*** 162,174 ****
FromCharDateMode date_mode;
} KeyWord;
! struct FormatNode
{
! int type; /* node type */
! const KeyWord *key; /* if node type is KEYWORD */
! char character; /* if node type is CHAR */
! int suffix; /* keyword suffix */
! };
#define NODE_TYPE_END 1
#define NODE_TYPE_ACTION 2
--- 160,172 ----
FromCharDateMode date_mode;
} KeyWord;
! typedef struct
{
! int type; /* NODE_TYPE_XXX, see below */
! const KeyWord *key; /* if type is ACTION */
! char character[MAX_MULTIBYTE_CHAR_LEN + 1]; /* if type is CHAR */
! int suffix; /* keyword prefix/suffix code, if any */
! } FormatNode;
#define NODE_TYPE_END 1
#define NODE_TYPE_ACTION 2
*************** parse_format(FormatNode *node, const cha
*** 1282,1293 ****
}
else if (*str)
{
/*
* Process double-quoted literal string, if any
*/
if (*str == '"')
{
! while (*(++str))
{
if (*str == '"')
{
--- 1280,1294 ----
}
else if (*str)
{
+ int chlen;
+
/*
* Process double-quoted literal string, if any
*/
if (*str == '"')
{
! str++;
! while (*str)
{
if (*str == '"')
{
*************** parse_format(FormatNode *node, const cha
*** 1297,1307 ****
/* backslash quotes the next character, if any */
if (*str == '\\' && *(str + 1))
str++;
n->type = NODE_TYPE_CHAR;
! n->character = *str;
n->key = NULL;
n->suffix = 0;
n++;
}
}
else
--- 1298,1311 ----
/* backslash quotes the next character, if any */
if (*str == '\\' && *(str + 1))
str++;
+ chlen = pg_mblen(str);
n->type = NODE_TYPE_CHAR;
! memcpy(n->character, str, chlen);
! n->character[chlen] = '\0';
n->key = NULL;
n->suffix = 0;
n++;
+ str += chlen;
}
}
else
*************** parse_format(FormatNode *node, const cha
*** 1312,1323 ****
*/
if (*str == '\\' && *(str + 1) == '"')
str++;
n->type = NODE_TYPE_CHAR;
! n->character = *str;
n->key = NULL;
n->suffix = 0;
n++;
! str++;
}
}
}
--- 1316,1329 ----
*/
if (*str == '\\' && *(str + 1) == '"')
str++;
+ chlen = pg_mblen(str);
n->type = NODE_TYPE_CHAR;
! memcpy(n->character, str, chlen);
! n->character[chlen] = '\0';
n->key = NULL;
n->suffix = 0;
n++;
! str += chlen;
}
}
}
*************** dump_node(FormatNode *node, int max)
*** 1349,1355 ****
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
else if (n->type == NODE_TYPE_CHAR)
! elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%c'", a, n->character);
else if (n->type == NODE_TYPE_END)
{
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
--- 1355,1362 ----
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_ACTION '%s'\t(%s,%s)",
a, n->key->name, DUMP_THth(n->suffix), DUMP_FM(n->suffix));
else if (n->type == NODE_TYPE_CHAR)
! elog(DEBUG_elog_output, "%d:\t NODE_TYPE_CHAR '%s'",
! a, n->character);
else if (n->type == NODE_TYPE_END)
{
elog(DEBUG_elog_output, "%d:\t NODE_TYPE_END", a);
*************** asc_toupper_z(const char *buff)
*** 2008,2015 ****
do { \
if (S_THth(_suf)) \
{ \
! if (*(ptr)) (ptr)++; \
! if (*(ptr)) (ptr)++; \
} \
} while (0)
--- 2015,2022 ----
do { \
if (S_THth(_suf)) \
{ \
! if (*(ptr)) (ptr) += pg_mblen(ptr); \
! if (*(ptr)) (ptr) += pg_mblen(ptr); \
} \
} while (0)
*************** is_next_separator(FormatNode *n)
*** 2076,2082 ****
return true;
}
! else if (isdigit((unsigned char) n->character))
return false;
return true; /* some non-digit input (separator) */
--- 2083,2090 ----
return true;
}
! else if (n->character[1] == '\0' &&
! isdigit((unsigned char) n->character[0]))
return false;
return true; /* some non-digit input (separator) */
*************** DCH_to_char(FormatNode *node, bool is_in
*** 2405,2412 ****
{
if (n->type != NODE_TYPE_ACTION)
{
! *s = n->character;
! s++;
continue;
}
--- 2413,2420 ----
{
if (n->type != NODE_TYPE_ACTION)
{
! strcpy(s, n->character);
! s += strlen(s);
continue;
}
*************** DCH_from_char(FormatNode *node, char *in
*** 2974,2980 ****
* we don't insist that the consumed character match the format's
* character.
*/
! s++;
continue;
}
--- 2982,2988 ----
* we don't insist that the consumed character match the format's
* character.
*/
! s += pg_mblen(s);
continue;
}
*************** get_last_relevant_decnum(char *num)
*** 4217,4223 ****
/*
* These macros are used in NUM_processor() and its subsidiary routines.
* OVERLOAD_TEST: true if we've reached end of input string
! * AMOUNT_TEST(s): true if at least s characters remain in string
*/
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
--- 4225,4231 ----
/*
* These macros are used in NUM_processor() and its subsidiary routines.
* OVERLOAD_TEST: true if we've reached end of input string
! * AMOUNT_TEST(s): true if at least s bytes remain in string
*/
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
*************** NUM_processor(FormatNode *node, NUMDesc
*** 4821,4829 ****
if (!Np->is_to_char)
{
/*
! * Check at least one character remains to be scanned. (In
! * actions below, must use AMOUNT_TEST if we want to read more
! * characters than that.)
*/
if (OVERLOAD_TEST)
break;
--- 4829,4837 ----
if (!Np->is_to_char)
{
/*
! * Check at least one byte remains to be scanned. (In actions
! * below, must use AMOUNT_TEST if we want to read more bytes than
! * that.)
*/
if (OVERLOAD_TEST)
break;
*************** NUM_processor(FormatNode *node, NUMDesc
*** 5081,5092 ****
* In TO_CHAR, non-pattern characters in the format are copied to
* the output. In TO_NUMBER, we skip one input character for each
* non-pattern format character, whether or not it matches the
! * format character. (Currently, that's actually implemented as
! * skipping one input byte per non-pattern format byte, which is
! * wrong...)
*/
if (Np->is_to_char)
! *Np->inout_p = n->character;
}
Np->inout_p++;
}
--- 5089,5106 ----
* In TO_CHAR, non-pattern characters in the format are copied to
* the output. In TO_NUMBER, we skip one input character for each
* non-pattern format character, whether or not it matches the
! * format character.
*/
if (Np->is_to_char)
! {
! strcpy(Np->inout_p, n->character);
! Np->inout_p += strlen(Np->inout_p);
! }
! else
! {
! Np->inout_p += pg_mblen(Np->inout_p);
! }
! continue;
}
Np->inout_p++;
}