[PATCH] Support negative indexes in split_part

Nikhil Benesch Sun, 01 Nov 2020 18:40:32 -0800

I posted the idea of allowing negative indexes in split_part to pgsql-general
last week, and it seemed there was some interest:


http://postgr.es/m/CAPWqQZR%2B-5pAZNSSrnmYczRaX-huemc%3DoO8URvDZvUA-M%3DMOBA%40mail.gmail.com

Attached is a patch, based on master, that implements the approach as described
in that discussion.

The motivation is that the existing idioms for splitting a string and selecting
the nth-to-last element are rather complicated and/or inefficient:

  1. (string_to_array('foo bar baz', ' '))[cardinality(string_to_array('foo bar 
baz', ' ')) - 1]
  2. reverse(split_part(reverse('foo bar baz'), ' ', 1))
  3. (regexp_match('foo baz bar', '\S*$'))[1]

With the patch, split_part(haystack, needle, -1) selects the last field of the
string, split_part(haystack, needle, -2) selects the second-to-last field, and
so on. Per Tom Lane, there is precedent for this design, where negative indices
meaning "count from the end", namely the left and right string functions.

The patch includes updates to the docs and regression tests. If the feature is
deemed desirable, I believe the patch is "commit quality" (though, fair warning,
this is my first contribution to Postgres, so I might have the wrong notion
of what a committable patch looks like).

Note that the implementation is deliberately a bit inefficient to keep things
simple. When presented with a negative index, the implementation does an extra
pass over the string to count the total number of fields, in order to convert
the negative index to a positive index. Then it proceeds as it normally would.

One can imagine adding support for backwards B-M-H, but I'm not sure that could
be made to work with all multibyte encodings. We could at least avoid the extra
pass over the string by allocating a circular buffer of size |n| when n is
negative, but that wasn't clearly worthwhile. (I did implement the optimization
for the special case of -1, since its implementation was trivial.)

Cheers,
Nikhil

From a09f95bf56a7b6400ebce9e6faad90a434d21ac8 Mon Sep 17 00:00:00 2001
From: Nikhil Benesch <[email protected]>
Date: Sun, 1 Nov 2020 15:53:17 -0500
Subject: [PATCH] Support negative indexes in split_part

Negative indexes count from the right instead of the left.
For example:

    split_part('a@b@c@d', -2) -> c

The motivation is to provide a straightforward alternative to the
complicated and inefficient idioms available for this today, e.g.:

     reverse(split_part(reverse(haystack), reverse(needle), 1))
     (regexp_match(haystack, needle || '$'))[1]

Discussion: 
http://postgr.es/m/CAPWqQZR%2B-5pAZNSSrnmYczRaX-huemc%3DoO8URvDZvUA-M%3DMOBA%40mail.gmail.com
---
 doc/src/sgml/func.sgml                |  4 +-
 src/backend/utils/adt/varlena.c       | 62 ++++++++++++++++++++++++---
 src/test/regress/expected/strings.out | 62 ++++++++++++++++++++++++++-
 src/test/regress/sql/strings.sql      | 20 +++++++++
 4 files changed, 139 insertions(+), 9 deletions(-)

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index 7ef2ec9972..d0880ab1f3 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -3356,7 +3356,9 @@ repeat('Pg', 4) <returnvalue>PgPgPgPg</returnvalue>
        <para>
         Splits <parameter>string</parameter> at occurrences
         of <parameter>delimiter</parameter> and returns
-        the <parameter>n</parameter>'th field (counting from one).
+        the <parameter>n</parameter>'th field (counting from one),
+        or the |<parameter>n</parameter>|'th-to-last field if
+        <parameter>n</parameter> is negative.
        </para>
        <para>
         <literal>split_part('abc~@~def~@~ghi', '~@~', 2)</literal>
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index d7bc330541..c90550a183 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -1471,6 +1471,19 @@ text_position_get_match_pos(TextPositionState *state)
        }
 }
 
+/*
+ * Resets to the initial state installed by text_position_setup.
+ * The next call to text_position_next will search from the beginning
+ * of the string.
+ */
+static void
+text_position_reset(TextPositionState *state)
+{
+       state->last_match = NULL;
+       state->refpoint = state->str1;
+       state->refpos = 0;
+}
+
 static void
 text_position_cleanup(TextPositionState *state)
 {
@@ -4582,7 +4595,7 @@ replace_text_regexp(text *src_text, void *regexp,
 /*
  * split_part
  * parse input string
- * return ord item (1 based)
+ * return ord item (1 based, negative counts from end)
  * based on provided field separator
  */
 Datum
@@ -4600,10 +4613,10 @@ split_part(PG_FUNCTION_ARGS)
        bool            found;
 
        /* field number is 1 based */
-       if (fldnum < 1)
+       if (fldnum == 0)
                ereport(ERROR,
                                (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-                                errmsg("field position must be greater than 
zero")));
+                                errmsg("field position must not be zero")));
 
        inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
        fldsep_len = VARSIZE_ANY_EXHDR(fldsep);
@@ -4615,8 +4628,8 @@ split_part(PG_FUNCTION_ARGS)
        /* empty field separator */
        if (fldsep_len < 1)
        {
-               /* if first field, return input string, else empty string */
-               if (fldnum == 1)
+               /* if first or last field, return input string, else empty 
string */
+               if (fldnum == 1 || fldnum == -1)
                        PG_RETURN_TEXT_P(inputstring);
                else
                        PG_RETURN_TEXT_P(cstring_to_text(""));
@@ -4632,14 +4645,49 @@ split_part(PG_FUNCTION_ARGS)
        if (!found)
        {
                text_position_cleanup(&state);
-               /* if field 1 requested, return input string, else empty string 
*/
-               if (fldnum == 1)
+               /* if first or last field, return input string, else empty 
string */
+               if (fldnum == 1 || fldnum == -1)
                        PG_RETURN_TEXT_P(inputstring);
                else
                        PG_RETURN_TEXT_P(cstring_to_text(""));
        }
        end_ptr = text_position_get_match_ptr(&state);
 
+       /*
+        * negative fields count from the right
+        * convert to positive field number by counting total number of fields
+        */
+       if (fldnum < 0)
+       {
+               /* searching from the first match, so string has two fields to 
start */
+               int                     fldlen = 2;
+
+               while (text_position_next(&state))
+                       fldlen++;
+
+               /* special case of last field does not require extra pass */
+               if (fldnum == -1)
+               {
+                       start_ptr = text_position_get_match_ptr(&state) + 
fldsep_len;
+                       end_ptr = VARDATA_ANY(inputstring) + inputstring_len;
+                       text_position_cleanup(&state);
+                       PG_RETURN_TEXT_P(cstring_to_text_with_len(start_ptr,
+                                                                               
                          end_ptr - start_ptr));
+               }
+
+               /* nonexistent field, so return empty string */
+               if (-fldnum > fldlen)
+               {
+                       text_position_cleanup(&state);
+                       PG_RETURN_TEXT_P(cstring_to_text(""));
+               }
+
+               /* reset to pointing at first match, but now with positive 
fldnum */
+               fldnum += fldlen + 1;
+               text_position_reset(&state);
+               text_position_next(&state);
+       }
+
        while (found && --fldnum > 0)
        {
                /* identify bounds of next field */
diff --git a/src/test/regress/expected/strings.out 
b/src/test/regress/expected/strings.out
index 8c034c9599..3226f3c0f7 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1552,8 +1552,44 @@ SELECT replace('yabadoo', 'bad', '') AS "yaoo";
 --
 -- test split_part
 --
+select split_part('','@',1) AS "empty string";
+ empty string 
+--------------
+ 
+(1 row)
+
+select split_part('','@',-1) AS "empty string";
+ empty string 
+--------------
+ 
+(1 row)
+
+select split_part('joeuser@mydatabase','',1) AS "joeuser@mydatabase";
+ joeuser@mydatabase 
+--------------------
+ joeuser@mydatabase
+(1 row)
+
+select split_part('joeuser@mydatabase','',2) AS "empty string";
+ empty string 
+--------------
+ 
+(1 row)
+
+select split_part('joeuser@mydatabase','',-1) AS "joeuser@mydatabase";
+ joeuser@mydatabase 
+--------------------
+ joeuser@mydatabase
+(1 row)
+
+select split_part('joeuser@mydatabase','',-2) AS "empty string";
+ empty string 
+--------------
+ 
+(1 row)
+
 select split_part('joeuser@mydatabase','@',0) AS "an error";
-ERROR:  field position must be greater than zero
+ERROR:  field position must not be zero
 select split_part('joeuser@mydatabase','@',1) AS "joeuser";
  joeuser 
 ---------
@@ -1578,6 +1614,30 @@ select split_part('@joeuser@mydatabase@','@',2) AS 
"joeuser";
  joeuser
 (1 row)
 
+select split_part('joeuser@mydatabase','@',-1) AS "mydatabase";
+ mydatabase 
+------------
+ mydatabase
+(1 row)
+
+select split_part('joeuser@mydatabase','@',-2) AS "joeuser";
+ joeuser 
+---------
+ joeuser
+(1 row)
+
+select split_part('joeuser@mydatabase','@',-3) AS "empty string";
+ empty string 
+--------------
+ 
+(1 row)
+
+select split_part('@joeuser@mydatabase@','@',-2) AS "mydatabase";
+ mydatabase 
+------------
+ mydatabase
+(1 row)
+
 --
 -- test to_hex
 --
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index 14901a2692..3f33a501f7 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -533,6 +533,18 @@ SELECT replace('yabadoo', 'bad', '') AS "yaoo";
 --
 -- test split_part
 --
+select split_part('','@',1) AS "empty string";
+
+select split_part('','@',-1) AS "empty string";
+
+select split_part('joeuser@mydatabase','',1) AS "joeuser@mydatabase";
+
+select split_part('joeuser@mydatabase','',2) AS "empty string";
+
+select split_part('joeuser@mydatabase','',-1) AS "joeuser@mydatabase";
+
+select split_part('joeuser@mydatabase','',-2) AS "empty string";
+
 select split_part('joeuser@mydatabase','@',0) AS "an error";
 
 select split_part('joeuser@mydatabase','@',1) AS "joeuser";
@@ -543,6 +555,14 @@ select split_part('joeuser@mydatabase','@',3) AS "empty 
string";
 
 select split_part('@joeuser@mydatabase@','@',2) AS "joeuser";
 
+select split_part('joeuser@mydatabase','@',-1) AS "mydatabase";
+
+select split_part('joeuser@mydatabase','@',-2) AS "joeuser";
+
+select split_part('joeuser@mydatabase','@',-3) AS "empty string";
+
+select split_part('@joeuser@mydatabase@','@',-2) AS "mydatabase";
+
 --
 -- test to_hex
 --
-- 
2.25.1

[PATCH] Support negative indexes in split_part

Reply via email to