Hello
Attached is a patch that adds "uri" as an encoding option for
encode/decode. It implements what RFC 3986 calls "percent-encoding"
(https://tools.ietf.org/html/rfc3986#section-2.1).
The background for this patch is that I can easily build URLs in
PL/pgSQL, but doing the actual encoding of the URL parts there is painfully
slow. The list of available encodings for encode/decode looks quite
arbitrary to me, so I can't see any reason why this one couldn't be included
as well.
In modern web scenarios one would most likely want to encode
the UTF-8 representation of a text string for inclusion in a URL, in
which case the correct invocation would be ENCODE(CONVERT_TO('some text in
database encoding goes here', 'UTF8'), 'uri'), but uri
percent-encoding can of course also be used for other text encodings
and arbitrary binary data.
Regards,
Anders
diff --git a/src/backend/utils/adt/encode.c b/src/backend/utils/adt/encode.c
index 7293d66de5..33cf7bb57c 100644
--- a/src/backend/utils/adt/encode.c
+++ b/src/backend/utils/adt/encode.c
@@ -512,6 +512,131 @@ esc_dec_len(const char *src, unsigned srclen)
return len;
}
+/*
+ * URI percent encoding
+ *
+ * Percent encodes all byte values except the unreserved ASCII characters as per RFC3986.
+ */
+
+/* Uppercase hexadecimal digit characters, indexed by nibble value (0-15). */
+static const char upper_hex_digits[] = "0123456789ABCDEF";
+
+/*
+ * uri_encode
+ *
+ * Write the percent-encoded form of the srclen bytes at src into dst and
+ * return the number of bytes written.  ASCII letters, digits, '-', '_', '.'
+ * and '~' are copied through unchanged; every other byte is emitted as '%'
+ * followed by two uppercase hexadecimal digits.  No terminator is appended.
+ *
+ * dst must have room for the result (at most three bytes per input byte).
+ */
+static unsigned
+uri_encode(const char *src, unsigned srclen, char *dst)
+{
+	const char *end = src + srclen;
+	char *out = dst;
+
+	while (src < end)
+	{
+		char c = *src++;
+		int unreserved = (c >= 'A' && c <= 'Z') ||
+			(c >= 'a' && c <= 'z') ||
+			(c >= '0' && c <= '9') ||
+			c == '-' ||
+			c == '_' ||
+			c == '.' ||
+			c == '~';
+
+		if (!unreserved)
+		{
+			/* Masking with 0xF keeps the index valid even if char is signed. */
+			*out++ = '%';
+			*out++ = upper_hex_digits[(c >> 4) & 0xF];
+			*out++ = upper_hex_digits[c & 0xF];
+			continue;
+		}
+
+		*out++ = c;
+	}
+
+	return out - dst;
+}
+
+/*
+ * uri_decode
+ *
+ * Reverse percent-encoding: copy the srclen bytes at src into dst, replacing
+ * each three-byte "%XY" sequence with the single byte whose hexadecimal
+ * value is XY.  All other bytes are copied verbatim.  Returns the number of
+ * bytes written to dst; no terminator is appended.
+ *
+ * A '%' that is not followed by at least two more bytes raises an error.
+ * Conversion of the two digits is delegated to get_hex(), which is expected
+ * to report non-hexadecimal characters itself.
+ */
+static unsigned
+uri_decode(const char *src, unsigned srclen, char *dst)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	char *d = dst;
+	char val;
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			/*
+			 * A complete escape needs three bytes.  Compare the remaining
+			 * length instead of computing "srcend - 3", which would form a
+			 * pointer before the start of the buffer when the input is
+			 * shorter than three bytes.  uri_dec_len performs the same check
+			 * and is expected to have run first, so this is a safety net.
+			 */
+			if (srcend - s < 3)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			}
+
+			/* Skip '%' */
+			s++;
+
+			/* High nibble first, then low nibble. */
+			val = get_hex(*s++) << 4;
+			val += get_hex(*s++);
+			*d++ = val;
+		}
+		else
+		{
+			*d++ = *s++;
+		}
+	}
+	return d - dst;
+}
+
+/*
+ * uri_enc_len
+ *
+ * Return the number of bytes needed to percent-encode the srclen bytes at
+ * src: one byte for each ASCII letter, digit, '-', '_', '.' or '~', and
+ * three bytes ("%XY") for every other byte.  The classification must stay
+ * in sync with uri_encode.
+ */
+static unsigned
+uri_enc_len(const char *src, unsigned srclen)
+{
+	/*
+	 * Accumulate in unsigned to match the return type; a signed counter
+	 * could overflow (undefined behavior) on very large inputs.
+	 */
+	unsigned len = 0;
+
+	for (const char *s = src; s < src + srclen; s++)
+	{
+		if ((*s >= 'A' && *s <= 'Z') ||
+			(*s >= 'a' && *s <= 'z') ||
+			(*s >= '0' && *s <= '9') ||
+			*s == '-' ||
+			*s == '_' ||
+			*s == '.' ||
+			*s == '~')
+		{
+			len++;
+		}
+		else
+		{
+			len += 3;
+		}
+	}
+	return len;
+}
+
+/*
+ * uri_dec_len
+ *
+ * Return the number of bytes that decoding the srclen bytes at src will
+ * produce: each "%XY" escape counts as one byte, as does every other input
+ * byte.  The input is validated along the way; a '%' without two following
+ * bytes raises an error.
+ */
+static unsigned
+uri_dec_len(const char *src, unsigned srclen)
+{
+	const char *s = src;
+	const char *srcend = src + srclen;
+	unsigned len = 0;		/* unsigned to match the return type */
+
+	while (s < srcend)
+	{
+		if (*s == '%')
+		{
+			/*
+			 * Compare the remaining length instead of computing
+			 * "srcend - 3", which would form a pointer before the start of
+			 * the buffer when the input is shorter than three bytes.
+			 */
+			if (srcend - s < 3)
+			{
+				ereport(ERROR,
+						(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+						 errmsg("invalid uri percent encoding"),
+						 errhint("Input data ends prematurely.")));
+			}
+			s++;
+
+			/*
+			 * Results are discarded; presumably get_hex() is called here
+			 * only so that it reports non-hexadecimal digits.
+			 */
+			get_hex(*s++);
+			get_hex(*s++);
+		}
+		else
+		{
+			s++;
+		}
+		len++;
+	}
+	return len;
+}
+
/*
* Common
*/
@@ -541,6 +666,12 @@ static const struct
esc_enc_len, esc_dec_len, esc_encode, esc_decode
}
},
+ {
+ "uri",
+ {
+ uri_enc_len, uri_dec_len, uri_encode, uri_decode
+ }
+ },
{
NULL,
{
diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out
index 2483966576..f89c5ec1c3 100644
--- a/src/test/regress/expected/strings.out
+++ b/src/test/regress/expected/strings.out
@@ -1870,3 +1870,24 @@ SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5
Th\000o\x02\x03
(1 row)
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+ encode
+-----------
+ en%C0%DEd
+(1 row)
+
+SELECT decode('%De%c0%DEd', 'uri');
+ decode
+------------
+ \xdec0de64
+(1 row)
+
+SELECT decode('error%Ex', 'uri');
+ERROR: invalid hexadecimal digit: "x"
+SELECT decode('error%E', 'uri');
+ERROR: invalid uri percent encoding
+HINT: Input data ends prematurely.
+SELECT decode('error%', 'uri');
+ERROR: invalid uri percent encoding
+HINT: Input data ends prematurely.
diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql
index b5e75c344f..1d03836b6e 100644
--- a/src/test/regress/sql/strings.sql
+++ b/src/test/regress/sql/strings.sql
@@ -641,3 +641,10 @@ SELECT btrim(E'\\000trim\\000'::bytea, ''::bytea);
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'Th\\001omas'::bytea from 2),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 8),'escape');
SELECT encode(overlay(E'Th\\000omas'::bytea placing E'\\002\\003'::bytea from 5 for 3),'escape');
+
+SET bytea_output TO hex;
+SELECT encode(E'en\\300\\336d'::bytea, 'uri');
+SELECT decode('%De%c0%DEd', 'uri');
+SELECT decode('error%Ex', 'uri');
+SELECT decode('error%E', 'uri');
+SELECT decode('error%', 'uri');