Re: Proposal to introduce a shuffle function to intarray extension

Martin Kalcher Sun, 17 Jul 2022 16:07:38 -0700

Am 18.07.22 um 00:37 schrieb Thomas Munro:

Seems OK for a worst case.  It must still be a lot faster than doing
it in SQL.  Now I wonder what the exact requirements would be to
dispatch to a faster version that would handle int4.  I haven't
studied this in detail but perhaps to dispatch to a fast shuffle for
objects of size X, the requirement would be something like typlen == X
&& align_bytes <= typlen && typlen % align_bytes == 0, where
align_bytes is typalign converted to ALIGNOF_{CHAR,SHORT,INT,DOUBLE}?
Or in English, 'the data consists of densely packed objects of fixed
size X, no padding'.  Or perhaps you can work out the padded size and
use that, to catch a few more types.  Then you call
array_shuffle_{2,4,8}() as appropriate, which should be as fast as
your original int[] proposal, but work also for float, date, ...?


About your experimental patch, I haven't reviewed it properly or tried
it but I wonder if uint32 dat_offset, uint32 size (= half size
elements) would be enough due to limitations on varlenas.

I made another experimental patch with fast tracks for typelen4 andtypelen8. alignments are not yet considered.

From 37269598a1dace99c9059514e0fdcb90b9240ed3 Mon Sep 17 00:00:00 2001
From: Martin Kalcher <[email protected]>
Date: Sun, 17 Jul 2022 18:06:04 +0200
Subject: [PATCH] introduce array_shuffle()

---
 src/backend/utils/adt/arrayfuncs.c | 178 +++++++++++++++++++++++++++++
 src/include/catalog/pg_proc.dat    |   3 +
 2 files changed, 181 insertions(+)

diff --git a/src/backend/utils/adt/arrayfuncs.c b/src/backend/utils/adt/arrayfuncs.c
index fb167f226a..919cb2bfd6 100644
--- a/src/backend/utils/adt/arrayfuncs.c
+++ b/src/backend/utils/adt/arrayfuncs.c
@@ -6872,3 +6872,181 @@ trim_array(PG_FUNCTION_ARGS)
 
 	PG_RETURN_DATUM(result);
 }
+
+static ArrayType *
+array_shuffle_4(ArrayType *array)
+{
+	ArrayType	*result;
+	int32		*adat,
+				*rdat;
+	int			i,
+				j,
+				nitems;
+
+	result = create_array_envelope(ARR_NDIM(array),
+								   ARR_DIMS(array),
+								   ARR_LBOUND(array),
+								   ARR_SIZE(array),
+								   ARR_ELEMTYPE(array),
+								   array->dataoffset);
+
+	adat = (int32 *) ARR_DATA_PTR(array);
+	rdat = (int32 *) ARR_DATA_PTR(result);
+	nitems = ARR_DIMS(array)[0];
+
+	for (i = 0; i < nitems; i++)
+	{
+		j = random() % (i + 1);
+		rdat[i] = rdat[j];
+		rdat[j] = adat[i];
+	}
+	return result;
+}
+
+static ArrayType *
+array_shuffle_8(ArrayType *array)
+{
+	ArrayType	*result;
+	int64		*adat,
+				*rdat;
+	int			i,
+				j,
+				nitems;
+
+	result = create_array_envelope(ARR_NDIM(array),
+								   ARR_DIMS(array),
+								   ARR_LBOUND(array),
+								   ARR_SIZE(array),
+								   ARR_ELEMTYPE(array),
+								   array->dataoffset);
+
+	adat = (int64 *) ARR_DATA_PTR(array);
+	rdat = (int64 *) ARR_DATA_PTR(result);
+	nitems = ARR_DIMS(array)[0];
+
+	for (i = 0; i < nitems; i++)
+	{
+		j = random() % (i + 1);
+		rdat[i] = rdat[j];
+		rdat[j] = adat[i];
+	}
+	return result;
+}
+
+static ArrayType *
+array_shuffle_generic(ArrayType *array,
+					  int16 elmlen, bool elmbyval, char elmalign)
+{
+	ArrayType	*result;
+	char		*adat,
+				*rdat;
+	int 		i,
+				ndim,
+				*dims,
+				nitems,
+				nelms_per_item;
+
+	struct {
+		char *dat;
+		size_t size;
+	} *refs;
+
+	ndim = ARR_NDIM(array);
+	dims = ARR_DIMS(array);
+
+	nelms_per_item = 1;
+
+	for (i = 1; i < ndim; i++)
+		nelms_per_item *= dims[i];
+
+	nitems = dims[0];
+
+	refs = palloc(nitems * (sizeof(char *) + sizeof(size_t)));
+	adat = ARR_DATA_PTR(array);
+
+	/*
+	 * To allow shuffling of arrays with variable length elements, we are going
+	 * to shuffle references to the elements, because swapping variable length
+	 * elements in an array is complicated. In a second step we will iterate the
+	 * shuffled references and copy the elements to the destination array.
+	 * We are using the "inside-out" variant of the Fisher-Yates algorithm to
+	 * produce a shuffled array of references: Append each reference than swap it
+	 * with a radomly-chosen refence.
+	 */
+	for (i = 0; i < nitems; i++)
+	{
+		char *next = array_seek(adat, 0, NULL, nelms_per_item,
+							    elmlen, elmbyval, elmalign);
+		int j = random() % (i + 1);
+		refs[i] = refs[j];
+		refs[j].dat = adat;
+		refs[j].size = next - adat;
+		adat = next;
+	}
+
+	result = create_array_envelope(ARR_NDIM(array),
+								   ARR_DIMS(array),
+								   ARR_LBOUND(array),
+								   ARR_SIZE(array),
+								   ARR_ELEMTYPE(array),
+								   array->dataoffset);
+
+	rdat = ARR_DATA_PTR(result);
+
+	/* Collect all references and copy elements to the destination array */
+	for (i = 0; i < nitems; i++)
+	{
+		memcpy(rdat, refs[i].dat, refs[i].size);
+		rdat += refs[i].size;
+	}
+
+	pfree(refs);
+
+	return result;
+}
+
+Datum
+array_shuffle(PG_FUNCTION_ARGS)
+{
+	ArrayType		*array,
+					*result;
+	int16			elmlen;
+	bool			elmbyval;
+	char			elmalign;
+	Oid             elmtyp;
+	TypeCacheEntry	*typentry;
+
+	array = PG_GETARG_ARRAYTYPE_P(0);
+
+	if (ARR_HASNULL(array))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("shuffling arrays with null elements is not supported")));
+
+	if (ARR_NDIM(array) < 1 || ARR_DIMS(array)[0] < 2)
+		PG_RETURN_ARRAYTYPE_P(array);
+
+	elmtyp = ARR_ELEMTYPE(array);
+
+	typentry = (TypeCacheEntry *) fcinfo->flinfo->fn_extra;
+
+	if (typentry == NULL || typentry->type_id != elmtyp)
+	{
+		typentry = lookup_type_cache(elmtyp, 0);
+		fcinfo->flinfo->fn_extra = (void *) typentry;
+	}
+	elmlen = typentry->typlen;
+	elmbyval = typentry->typbyval;
+	elmalign = typentry->typalign;
+
+	if (ARR_NDIM(array) == 1 && elmlen == 4)
+		result = array_shuffle_4(array);
+	else if (ARR_NDIM(array) == 1 && elmlen == 8)
+		result = array_shuffle_8(array);
+	else
+		result = array_shuffle_generic(array, elmlen, elmbyval, elmalign);
+
+	PG_FREE_IF_COPY(array, 0);
+
+	PG_RETURN_ARRAYTYPE_P(result);
+}
diff --git a/src/include/catalog/pg_proc.dat b/src/include/catalog/pg_proc.dat
index 2e41f4d9e8..56aff551d3 100644
--- a/src/include/catalog/pg_proc.dat
+++ b/src/include/catalog/pg_proc.dat
@@ -1681,6 +1681,9 @@
   proname => 'arraycontjoinsel', provolatile => 's', prorettype => 'float8',
   proargtypes => 'internal oid internal int2 internal',
   prosrc => 'arraycontjoinsel' },
+{ oid => '7777', descr => 'shuffle array',
+  proname => 'array_shuffle', proisstrict => 'f', prorettype => 'anyarray',
+  proargtypes => 'anyarray', prosrc => 'array_shuffle' },
 
 { oid => '764', descr => 'large object import',
   proname => 'lo_import', provolatile => 'v', proparallel => 'u',
-- 
2.37.1

Re: Proposal to introduce a shuffle function to intarray extension

Reply via email to