On Wed, 19 Jul 2023 at 23:14, Dean Rasheed <dean.a.rash...@gmail.com> wrote:
> Hmm, I'm somewhat sceptical about this second patch. It's not obvious
> why adding such tests would speed it up, and indeed, testing on my
> machine with 50M rows, I see a noticeable speed-up from patch 1, and a
> slow-down from patch 2:

I noticed that 6fcda9aba added quite a lot of conditions that need to
be checked before we process a normal decimal integer string. I think
we could likely do better and code it to assume that most strings will
be decimal and put that case first rather than last.

In the attached, I've changed that for the 32-bit version only.  A
more complete patch would need to do the 16 and 64-bit versions too.

-- setup
create table a (a int);
insert into a select x from generate_series(1,10000000)x;
copy a to '~/a.dump';

-- test
truncate a; copy a from '/tmp/a.dump';

master @ ab29a7a9c
Time: 2152.633 ms (00:02.153)
Time: 2121.091 ms (00:02.121)
Time: 2100.995 ms (00:02.101)
Time: 2101.724 ms (00:02.102)
Time: 2103.949 ms (00:02.104)

master + pg_strtoint32_base_10_first.patch
Time: 2061.464 ms (00:02.061)
Time: 2035.513 ms (00:02.036)
Time: 2028.356 ms (00:02.028)
Time: 2043.218 ms (00:02.043)
Time: 2037.035 ms (00:02.037) (~3.6% faster)

Without that, we need to check if the first digit is '0' a total of 3
times and also check if the 2nd digit is any of 'x', 'X', 'o', 'O',
'b' or 'B'. It seems to be coded with the assumption that hex strings
are the most likely. I think decimals are the most likely by far.

David
diff --git a/src/backend/utils/adt/numutils.c b/src/backend/utils/adt/numutils.c
index 471fbb7ee6..60c90f7252 100644
--- a/src/backend/utils/adt/numutils.c
+++ b/src/backend/utils/adt/numutils.c
@@ -314,113 +314,119 @@ pg_strtoint32_safe(const char *s, Node *escontext)
        else if (*ptr == '+')
                ptr++;
 
-       /* process digits */
-       if (ptr[0] == '0' && (ptr[1] == 'x' || ptr[1] == 'X'))
+       /* process decimal digits */
+       if (isdigit((unsigned char) ptr[0]) &&
+               (isdigit((unsigned char) ptr[1]) || ptr[1] == '_' || ptr[1] == 
'\0' || isspace(ptr[1])))
        {
-               firstdigit = ptr += 2;
+               firstdigit = ptr;
 
                while (*ptr)
                {
-                       if (isxdigit((unsigned char) *ptr))
+                       if (isdigit((unsigned char) *ptr))
                        {
-                               if (unlikely(tmp > -(PG_INT32_MIN / 16)))
+                               if (unlikely(tmp > -(PG_INT32_MIN / 10)))
                                        goto out_of_range;
 
-                               tmp = tmp * 16 + hexlookup[(unsigned char) 
*ptr++];
+                               tmp = tmp * 10 + (*ptr++ - '0');
                        }
                        else if (*ptr == '_')
                        {
-                               /* underscore must be followed by more digits */
+                               /* underscore may not be first */
+                               if (unlikely(ptr == firstdigit))
+                                       goto invalid_syntax;
+                               /* and it must be followed by more digits */
                                ptr++;
-                               if (*ptr == '\0' || !isxdigit((unsigned char) 
*ptr))
+                               if (*ptr == '\0' || !isdigit((unsigned char) 
*ptr))
                                        goto invalid_syntax;
                        }
                        else
                                break;
                }
        }
-       else if (ptr[0] == '0' && (ptr[1] == 'o' || ptr[1] == 'O'))
+       /* process hex digits */
+       else if (ptr[0] == '0' && (ptr[1] == 'x' || ptr[1] == 'X'))
        {
                firstdigit = ptr += 2;
 
                while (*ptr)
                {
-                       if (*ptr >= '0' && *ptr <= '7')
+                       if (isxdigit((unsigned char) *ptr))
                        {
-                               if (unlikely(tmp > -(PG_INT32_MIN / 8)))
+                               if (unlikely(tmp > -(PG_INT32_MIN / 16)))
                                        goto out_of_range;
 
-                               tmp = tmp * 8 + (*ptr++ - '0');
+                               tmp = tmp * 16 + hexlookup[(unsigned char) 
*ptr++];
                        }
                        else if (*ptr == '_')
                        {
                                /* underscore must be followed by more digits */
                                ptr++;
-                               if (*ptr == '\0' || *ptr < '0' || *ptr > '7')
+                               if (*ptr == '\0' || !isxdigit((unsigned char) 
*ptr))
                                        goto invalid_syntax;
                        }
                        else
                                break;
                }
        }
-       else if (ptr[0] == '0' && (ptr[1] == 'b' || ptr[1] == 'B'))
+       /* process octal digits */
+       else if (ptr[0] == '0' && (ptr[1] == 'o' || ptr[1] == 'O'))
        {
                firstdigit = ptr += 2;
 
                while (*ptr)
                {
-                       if (*ptr >= '0' && *ptr <= '1')
+                       if (*ptr >= '0' && *ptr <= '7')
                        {
-                               if (unlikely(tmp > -(PG_INT32_MIN / 2)))
+                               if (unlikely(tmp > -(PG_INT32_MIN / 8)))
                                        goto out_of_range;
 
-                               tmp = tmp * 2 + (*ptr++ - '0');
+                               tmp = tmp * 8 + (*ptr++ - '0');
                        }
                        else if (*ptr == '_')
                        {
                                /* underscore must be followed by more digits */
                                ptr++;
-                               if (*ptr == '\0' || *ptr < '0' || *ptr > '1')
+                               if (*ptr == '\0' || *ptr < '0' || *ptr > '7')
                                        goto invalid_syntax;
                        }
                        else
                                break;
                }
        }
-       else
+       /* process binary digits */
+       else if (ptr[0] == '0' && (ptr[1] == 'b' || ptr[1] == 'B'))
        {
-               firstdigit = ptr;
+               firstdigit = ptr += 2;
 
                while (*ptr)
                {
-                       if (isdigit((unsigned char) *ptr))
+                       if (*ptr >= '0' && *ptr <= '1')
                        {
-                               if (unlikely(tmp > -(PG_INT32_MIN / 10)))
+                               if (unlikely(tmp > -(PG_INT32_MIN / 2)))
                                        goto out_of_range;
 
-                               tmp = tmp * 10 + (*ptr++ - '0');
+                               tmp = tmp * 2 + (*ptr++ - '0');
                        }
                        else if (*ptr == '_')
                        {
-                               /* underscore may not be first */
-                               if (unlikely(ptr == firstdigit))
-                                       goto invalid_syntax;
-                               /* and it must be followed by more digits */
+                               /* underscore must be followed by more digits */
                                ptr++;
-                               if (*ptr == '\0' || !isdigit((unsigned char) 
*ptr))
+                               if (*ptr == '\0' || *ptr < '0' || *ptr > '1')
                                        goto invalid_syntax;
                        }
                        else
                                break;
                }
        }
+       else
+               goto invalid_syntax;
 
        /* require at least one digit */
        if (unlikely(ptr == firstdigit))
                goto invalid_syntax;
 
        /* allow trailing whitespace, but not other trailing chars */
-       while (*ptr != '\0' && isspace((unsigned char) *ptr))
+       while (isspace((unsigned char) *ptr))
                ptr++;
 
        if (unlikely(*ptr != '\0'))

Reply via email to