 src/common/wchar.c          | 116 ++++++++++++++++++++++++++-------
 src/include/port/pg_utf8.h  |  74 +++++++++++++++++++++
 src/port/Makefile           |  13 +++-
 src/port/pg_utf8_choose.c   |  65 +++++++++++++++++++
 src/port/pg_utf8_fallback.c | 153 ++++++++++++++++++++++++++++++++++++++++++++
 src/port/pg_utf8_sse42.c    |  29 +++++++++
 6 files changed, 425 insertions(+), 25 deletions(-)

diff --git a/src/common/wchar.c b/src/common/wchar.c
index 6e7d731e02..742957e67e 100644
--- a/src/common/wchar.c
+++ b/src/common/wchar.c
@@ -13,6 +13,7 @@
 #include "c.h"
 
 #include "mb/pg_wchar.h"
+#include "port/pg_utf8.h"
 
 
 /*
@@ -1189,6 +1190,15 @@ pg_eucjp_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1247,6 +1257,15 @@ pg_euckr_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1330,6 +1349,15 @@ pg_euctw_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1383,6 +1411,15 @@ pg_johab_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1433,6 +1470,15 @@ pg_mule_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1502,6 +1548,15 @@ pg_sjis_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1551,6 +1606,15 @@ pg_big5_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1600,6 +1664,15 @@ pg_gbk_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1649,6 +1722,15 @@ pg_uhc_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1709,6 +1791,15 @@ pg_gb18030_verifystr(const unsigned char *s, int len)
 		int			l;
 
 		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
 		if (!IS_HIGHBIT_SET(*s))
 		{
 			if (*s == '\0')
@@ -1760,30 +1851,7 @@ pg_utf8_verifychar(const unsigned char *s, int len)
 static int
 pg_utf8_verifystr(const unsigned char *s, int len)
 {
-	const unsigned char *start = s;
-
-	while (len > 0)
-	{
-		int			l;
-
-		/* fast path for ASCII-subset characters */
-		if (!IS_HIGHBIT_SET(*s))
-		{
-			if (*s == '\0')
-				break;
-			l = 1;
-		}
-		else
-		{
-			l = pg_utf8_verifychar(s, len);
-			if (l == -1)
-				break;
-		}
-		s += l;
-		len -= l;
-	}
-
-	return s - start;
+	return pg_validate_utf8(s, len);
 }
 
 /*
diff --git a/src/include/port/pg_utf8.h b/src/include/port/pg_utf8.h
new file mode 100644
index 0000000000..b0e0939e43
--- /dev/null
+++ b/src/include/port/pg_utf8.h
@@ -0,0 +1,74 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8.h
+ *	  Routines for fast validation of UTF-8 text.
+ *
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * src/include/port/pg_utf8.h
+ *
+ *-------------------------------------------------------------------------
+ */
+#ifndef PG_UTF8_H
+#define PG_UTF8_H
+
+
+#if defined(USE_SSE42_CRC32C)
+/* Use Intel SSE 4.2 instructions unconditionally (no runtime check). */
+extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+
+#elif defined(USE_SSE42_CRC32C_WITH_RUNTIME_CHECK)
+/*
+ * Runtime check for SSE 4.2, dispatched via a function pointer.  NOTE(review):
+ * only this branch declares pg_validate_utf8, but wchar.c calls it always.
+ */
+extern int (*pg_validate_utf8) (const unsigned char *s, int len);
+extern int pg_validate_utf8_sse42(const unsigned char *s, int len);
+extern int pg_validate_utf8_fallback(const unsigned char *s, int len);
+
+#else
+extern int pg_validate_utf8_fallback(const unsigned char *s, int len);
+
+#endif							/* USE_SSE42_CRC32C */
+
+
+/* Nonzero iff some byte of the 64-bit chunk is zero; bit trick taken from */
+/* https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord */
+#define HAS_ZERO(chunk) ( \
+	((chunk) - UINT64CONST(0x0101010101010101)) & \
+	 ~(chunk) & \
+	 UINT64CONST(0x8080808080808080))
+
+/*
+ * Fast-path test of the next 2 * sizeof(uint64) bytes at s.  Returns that
+ * chunk size if len covers a full chunk and every byte in it is nonzero
+ * with the high bit clear; otherwise returns 0 and consumes nothing.
+ */
+static inline int
+check_ascii(const unsigned char *s, int len)
+{
+	uint64		lo;
+	uint64		hi;
+
+	/* Too short for a whole chunk: leave it to the caller. */
+	if (len < 2 * sizeof(uint64))
+		return 0;
+
+	/* Load both halves with memcpy rather than through a pointer cast. */
+	memcpy(&lo, s, sizeof(uint64));
+	memcpy(&hi, s + sizeof(uint64), sizeof(uint64));
+
+	/* An embedded zero byte must be handled by the slow path. */
+	if (HAS_ZERO(lo) || HAS_ZERO(hi))
+		return 0;
+
+	/* Any byte with its high bit set makes the chunk non-ASCII. */
+	if ((lo | hi) & UINT64CONST(0x8080808080808080))
+		return 0;
+
+	return 2 * sizeof(uint64);
+}
+
+#endif							/* PG_UTF8_H */
diff --git a/src/port/Makefile b/src/port/Makefile
index e41b005c4f..bd33d500c5 100644
--- a/src/port/Makefile
+++ b/src/port/Makefile
@@ -59,7 +59,13 @@ OBJS = \
 	snprintf.o \
 	strerror.o \
 	tar.o \
-	thread.o
+	thread.o \
+	pg_utf8_sse42.o \
+	pg_utf8_fallback.o \
+	pg_utf8_choose.o
+
+# FIXME --^
+# we need something like $(PG_SSE42_OBJS)
 
 # libpgport.a, libpgport_shlib.a, and libpgport_srv.a contain the same files
 # foo.o, foo_shlib.o, and foo_srv.o are all built from foo.c
@@ -88,6 +94,11 @@ libpgport.a: $(OBJS)
 thread.o: CFLAGS+=$(PTHREAD_CFLAGS)
 thread_shlib.o: CFLAGS+=$(PTHREAD_CFLAGS)
 
+# all versions of pg_utf8_sse42.o need CFLAGS_SSE42
+pg_utf8_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
+pg_utf8_sse42_srv.o: CFLAGS+=$(CFLAGS_SSE42)
+
 # all versions of pg_crc32c_sse42.o need CFLAGS_SSE42
 pg_crc32c_sse42.o: CFLAGS+=$(CFLAGS_SSE42)
 pg_crc32c_sse42_shlib.o: CFLAGS+=$(CFLAGS_SSE42)
diff --git a/src/port/pg_utf8_choose.c b/src/port/pg_utf8_choose.c
new file mode 100644
index 0000000000..4dd80c2189
--- /dev/null
+++ b/src/port/pg_utf8_choose.c
@@ -0,0 +1,65 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_choose.c
+ *	  Choose between Intel SSE 4.2 and fallback implementation.
+ *
+ * On first call, checks if the CPU we're running on supports Intel SSE
+ * 4.2. If it does, use SSE instructions for UTF-8 validation. Otherwise,
+ * fall back to the pure C implementation which has a fast path for ASCII
+ * text.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_choose.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#ifdef HAVE__GET_CPUID
+#include <cpuid.h>
+#endif
+
+#ifdef HAVE__CPUID
+#include <intrin.h>
+#endif
+
+#include "port/pg_utf8.h"
+
+/* Report whether CPUID leaf 1 sets ECX bit 20, the SSE 4.2 feature flag. */
+static bool
+pg_utf8_sse42_available(void)
+{
+	unsigned int regs[4] = {0};	/* EAX, EBX, ECX, EDX */
+
+#if defined(HAVE__GET_CPUID)
+	__get_cpuid(1, &regs[0], &regs[1], &regs[2], &regs[3]);
+#elif defined(HAVE__CPUID)
+	__cpuid(regs, 1);
+#else
+#error cpuid instruction not available
+#endif
+	return ((regs[2] >> 20) & 1) != 0;
+}
+
+/*
+ * Initial target of pg_validate_utf8.  On the first call it stores the chosen
+ * implementation in the pointer, so later calls bypass it, then validates.
+ */
+static int
+pg_validate_utf8_choose(const unsigned char *s, int len)
+{
+	if (pg_utf8_sse42_available())
+		/* FIXME: select pg_validate_utf8_sse42 here once it is implemented */
+		pg_validate_utf8 = pg_validate_utf8_fallback;	/* placeholder */
+	else
+		pg_validate_utf8 = pg_validate_utf8_fallback;
+
+	return pg_validate_utf8(s, len);
+}
+
+int	(*pg_validate_utf8) (const unsigned char *s, int len) = pg_validate_utf8_choose;
diff --git a/src/port/pg_utf8_fallback.c b/src/port/pg_utf8_fallback.c
new file mode 100644
index 0000000000..113534c2ec
--- /dev/null
+++ b/src/port/pg_utf8_fallback.c
@@ -0,0 +1,153 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_fallback.c
+ *	  Validate UTF-8 with a fast path for the ASCII subset.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_fallback.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include "port/pg_utf8.h"
+
+
+#define IS_CONTINUATION_BYTE(c) (((c) & 0xC0) == 0x80)
+
+/*
+ * Portable UTF-8 validator with a fast path for runs of ASCII.
+ * Returns the length in bytes of the longest valid prefix of s[0..len);
+ * a zero byte, truncated or malformed sequence, overlong encoding,
+ * surrogate, or code point above U+10FFFF ends the prefix.  See also the
+ * comment in common/wchar.c under "multibyte sequence validators".
+ */
+int
+pg_validate_utf8_fallback(const unsigned char *s, int len)
+{
+	const unsigned char *start = s;
+	unsigned char b1, b2, b3, b4;
+#ifdef USE_ASSERT_CHECKING
+	uint32		code_point;
+#endif
+
+	while (len > 0)
+	{
+		int			l;
+
+		/* fast path for ASCII-subset characters */
+		l = check_ascii(s, len);
+		if (l)
+		{
+			s += l;
+			len -= l;
+			continue;
+		}
+
+		/* Found non-ASCII or zero above, so verify a single character. */
+		if (!IS_HIGHBIT_SET(*s))
+		{
+			if (*s == '\0')
+				break;
+			l = 1;
+		}
+		else if ((*s & 0xE0) == 0xC0)
+		{
+			l = 2;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+
+			if (!IS_CONTINUATION_BYTE(b2))
+				break;
+
+			/* check 2-byte overlong: 1100.000x 10xx.xxxx */
+			if (b1 < 0xC2)
+				break;
+
+#ifdef USE_ASSERT_CHECKING
+			code_point = (b1 & 0x1F) << 6 | (b2 & 0x3F);
+
+			Assert(code_point >= 0x80 && code_point <= 0x7FF);
+#endif
+		}
+		else if ((*s & 0xF0) == 0xE0)
+		{
+			l = 3;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+
+			if (!IS_CONTINUATION_BYTE(b2) || !IS_CONTINUATION_BYTE(b3))
+				break;
+
+			/* check 3-byte overlong: 1110.0000 100x.xxxx 10xx.xxxx */
+			if (b1 == 0xE0 && b2 < 0xA0)
+				break;
+
+			/* check surrogate: 1110.1101 101x.xxxx 10xx.xxxx */
+			if (b1 == 0xED && b2 > 0x9F)
+				break;
+
+#ifdef USE_ASSERT_CHECKING
+			code_point = (b1 & 0x0F) << 12 |
+				(b2 & 0x3F) << 6 |
+				(b3 & 0x3F);
+
+			Assert((code_point >= 0x0800 && code_point <= 0xD7FF) ||
+				   (code_point >= 0xE000 && code_point <= 0xFFFF));
+#endif
+		}
+		else if ((*s & 0xF8) == 0xF0)
+		{
+			l = 4;
+			if (len < l)
+				break;
+
+			b1 = *s;
+			b2 = *(s + 1);
+			b3 = *(s + 2);
+			b4 = *(s + 3);
+
+			if (!IS_CONTINUATION_BYTE(b2) ||
+				!IS_CONTINUATION_BYTE(b3) ||
+				!IS_CONTINUATION_BYTE(b4))
+				break;
+
+			/* 4-byte overlong: 1111.0000 1000.xxxx 10xx.xxxx 10xx.xxxx */
+			if (b1 == 0xF0 && b2 < 0x90)
+				break;
+
+			/* check too large: beyond F4 8F BF BF, i.e. above U+10FFFF */
+			if ((b1 == 0xF4 && b2 > 0x8F) || b1 > 0xF4)
+				break;
+
+#ifdef USE_ASSERT_CHECKING
+			code_point = (b1 & 0x07) << 18 |
+				(b2 & 0x3F) << 12 |
+				(b3 & 0x3F) << 6 |
+				(b4 & 0x3F);
+
+			Assert(code_point >= 0x010000 && code_point <= 0x10FFFF);
+#endif
+		}
+		else
+			/* We may have a bare continuation or large byte. */
+			break;
+
+		s += l;
+		len -= l;
+	}
+
+	return s - start;
+}
diff --git a/src/port/pg_utf8_sse42.c b/src/port/pg_utf8_sse42.c
new file mode 100644
index 0000000000..30bd9769b6
--- /dev/null
+++ b/src/port/pg_utf8_sse42.c
@@ -0,0 +1,29 @@
+/*-------------------------------------------------------------------------
+ *
+ * pg_utf8_sse42.c
+ *	  Validate UTF-8 with Intel SSE 4.2 instructions.
+ *
+ * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ *
+ * IDENTIFICATION
+ *	  src/port/pg_utf8_sse42.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "c.h"
+
+#include <nmmintrin.h>
+
+#include "mb/pg_wchar.h"
+#include "port/pg_utf8.h"
+
+/* TODO: placeholder for the SSE 4.2 validator; asserts and returns 0 for now. */
+int
+pg_validate_utf8_sse42(const unsigned char *s, int len)
+{
+	Assert(0);
+	return 0;
+}
