On Sat, Dec 12, 2015 at 7:42 PM, Ants Aasma <[email protected]> wrote:
> As the open coding doesn't help with eliminating control flow
> dependencies, so my idea is to encode the sort network comparison
> order in an array and use that to drive a simple loop. The code size
> would be pretty similar to insertion sort and the loop overhead should
> mostly be hidden by the CPU OoO machinery. Probably won't help much,
> but would be interesting and simple enough to try out. Can you share
> you code for the benchmark so I can try it out?
I can. But the further results showing the number of comparisons is
higher than for insertion sort have dampened my enthusiasm for the
change. I'm assuming that even if it's faster for a simple integer or
sort it'll be much slower for anything that requires calling out to
the datatype comparator. I also hadn't actually measured what
percentage of the sort was being spent in the insertion sort. I had
guessed it would be higher.
The test is attached. qsort_tuple.c is copied from tuplesort (with the
ifdef for NOPRESORT added, but you could skip that if you want.).
Compile with something like:
gcc -DNOPRESORT -O3 -DCOUNTS -Wall -Wno-unused-function simd-sort-test.c
--
greg
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <sys/time.h>
#include <sys/types.h>
#include <unistd.h>
#ifdef COUNTS
unsigned nswaps;
unsigned ncmps;
#define countswap (nswaps++)
#define countcmp (ncmps++)
#else
#define countswap ((void)0)
#define countcmp ((void)0)
#endif
typedef void *Tuplesortstate;
typedef long long int Datum;
typedef int bool;
typedef struct
{
void *tuple; /* the tuple proper */
Datum datum1; /* value of first key column */
bool isnull1; /* is first key column NULL? */
int tupindex; /* see notes above */
} SortTuple;
typedef SortTuple elem_t;
typedef int (*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
Tuplesortstate *state);
typedef struct SortSupportData *SortSupport;
static int comparator(Datum a, Datum b, SortSupport ssup) {
if (b > a)
return -1;
else if (b < a)
return 1;
else return 0;
};
struct SortSupportData {
bool ssup_nulls_first;
bool ssup_reverse;
int (*comparator)(Datum,Datum,SortSupport);
};
struct SortSupportData ssup = {0, 0, comparator};
static inline int
ApplySortComparator(Datum datum1, bool isNull1,
Datum datum2, bool isNull2,
SortSupport ssup)
{
int compare;
countcmp;
if (isNull1)
{
if (isNull2)
compare = 0; /* NULL "=" NULL */
else if (ssup->ssup_nulls_first)
compare = -1; /* NULL "<" NOT_NULL */
else
compare = 1; /* NULL ">" NOT_NULL */
}
else if (isNull2)
{
if (ssup->ssup_nulls_first)
compare = 1; /* NOT_NULL ">" NULL */
else
compare = -1; /* NOT_NULL "<" NULL */
}
else
{
compare = (*ssup->comparator) (datum1, datum2, ssup);
if (ssup->ssup_reverse)
compare = -compare;
}
return compare;
}
#define CHECK_FOR_INTERRUPTS() do {} while (0)
#define Min(a,b) ((a)<(b)?(a):(b))
#define Max(a,b) ((a)<(b)?(b):(a))
#include "qsort_tuple.c"
#define mycmp(a,b) \
(ApplySortComparator(list[a].datum1, list[a].isnull1, \
list[b].datum1, list[b].isnull1, \
&ssup))
#define myswap(a,b) \
do { \
elem_t _tmp; \
_tmp = list[a]; \
list[a] = list[b]; \
list[b] = _tmp; \
countswap; \
} while (0)
#define myswapif(a,b) \
do { \
if (mycmp(a,b) > 0) \
myswap(a,b); \
} while (0)
static int insertion_ok(unsigned N) {
return N<1000;
}
static void insertion(elem_t *list, unsigned N) {
if (N > 1000) {
printf("insertion sort not feasible for large N\n");
exit(1);
}
for (unsigned pm = 1; pm < N; pm++)
for (unsigned pl = pm; pl && mycmp(pl-1, pl) > 0; pl--)
myswap(pl, pl-1);
}
static int sort_networks_ok(unsigned N) {
return N<=8;
}
static void sort_networks(elem_t *list, unsigned N) {
if (N > 8) {
printf("sort_networks only implemented for N in 0..8\n");
exit(1);
}
switch(N) {
case 0:
case 1:
break;
case 2:
myswapif(0,1);
break;
case 3:
myswapif(0,1); myswapif(1,2);
myswapif(0,1);
break;
case 4:
myswapif(0,1); myswapif(2,3);
myswapif(1,3); myswapif(0,2);
myswapif(1,2);
break;
case 5:
myswapif(1,2); myswapif(3,4);
myswapif(1,3); myswapif(0,2);
myswapif(2,4); myswapif(0,3);
myswapif(0,1); myswapif(2,3);
myswapif(1,2);
break;
case 6:
myswapif(0,1); myswapif(2,3); myswapif(4,5);
myswapif(0,2); myswapif(3,5); myswapif(1,4);
myswapif(0,1); myswapif(2,3); myswapif(4,5);
myswapif(1,2); myswapif(3,4);
myswapif(2,3);
break;
case 7:
myswapif(1,2); myswapif(3,4); myswapif(5,6);
myswapif(0,2); myswapif(4,6); myswapif(3,5);
myswapif(2,6); myswapif(1,5); myswapif(0,4);
myswapif(2,5); myswapif(0,3);
myswapif(2,4); myswapif(1,3);
myswapif(0,1); myswapif(2,3); myswapif(4,5);
break;
case 8:
myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7);
myswapif(0, 3); myswapif(1, 2); myswapif(4, 7); myswapif(5, 6);
myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7);
myswapif(0, 7); myswapif(1, 6); myswapif(2, 5); myswapif(3, 4);
myswapif(1, 3); myswapif(0, 2); myswapif(5, 7); myswapif(4, 6);
myswapif(0, 1); myswapif(2, 3); myswapif(4, 5); myswapif(6, 7);
break;
}
}
static int bitonic_ok(unsigned N) {
return (N&(N-1))==0;
}
static void bitonic_serial(elem_t *list, unsigned N) {
if (N & (N-1)) {
printf("bitonic sort implemented only for powers of 2\n");
exit(1);
}
int i,j,k;
for (k=2;k<=N;k=2*k) {
for (j=k>>1;j>0;j=j>>1) {
for (i=0;i<N;i++) {
int ixj=i^j;
if (ixj > i) {
if ((i & k) == 0)
myswapif(i,ixj);
if ((i & k) != 0)
myswapif(ixj, i);
}
}
}
}
}
static int qsort_compare(const void *_a, const void *_b) {
const elem_t *a = _a, *b = _b;
return ApplySortComparator(a->datum1, a->isnull1,
b->datum1, b->isnull1,
&ssup);
}
static void quicksort(elem_t *list, unsigned N) {
qsort(list, N, sizeof(elem_t), qsort_compare);
}
static void test_qsort_ssup(elem_t *list, unsigned N) {
qsort_ssup(list, N, &ssup);
}
static elem_t *generate_list(unsigned N) {
elem_t *list = aligned_alloc(256, sizeof(elem_t) * N);
memset(list, 0, sizeof(elem_t) * N);
for (unsigned i=0 ; i<N; i++) {
SortTuple x = {NULL, 0, 0, 0};
x.datum1 = (Datum)rand() << (8*(sizeof(x.datum1) - sizeof(int)));
list[i] = x;
}
return list;
}
static void time_sort(char *label, unsigned seed, unsigned N, unsigned M, void (*sort)(elem_t*, unsigned)) {
unsigned reps = M/N;
elem_t **lists = malloc(sizeof(elem_t*) * reps);
srand(seed);
for (unsigned i=0; i<reps; i++)
lists[i] = generate_list(N);
#ifdef COUNTS
nswaps = 0;
ncmps = 0;
#endif
struct timespec t1, t2;
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t1);
for (unsigned i=0;i<reps; i++)
sort(lists[i], N);
clock_gettime(CLOCK_THREAD_CPUTIME_ID, &t2);
for (unsigned i=0; i<reps; i++)
free(lists[i]);
free(lists);
double elapsed = ((double)
(t2.tv_sec - t1.tv_sec) * 1000 * 1000 * 1000 +
(t2.tv_nsec - t1.tv_nsec)) / reps;
char *units = "ns";
if (elapsed > 1000) {
elapsed /= 1000;
units = "us";
if (elapsed > 1000) {
elapsed /= 1000;
units = "ms";
if (elapsed > 1000) {
elapsed /= 1000;
units = "s";
if (elapsed > 119) {
elapsed /= 60;
units = "m";
if (elapsed > 119) {
elapsed /= 60;
units = "h";
}
}
}
}
}
printf("using %s sort %0.3f%s per sort of %u %lu-byte items",
label,
elapsed, units,
N, sizeof(elem_t)/sizeof(char));
#ifdef COUNTS
if (ncmps)
printf(" %0.1f compares/sort", (double)ncmps/reps);
if (nswaps)
printf(" %0.1f swaps/sort", (double)nswaps/reps);
#endif
printf("\n");
}
struct sort {
char *label;
void (*func)(elem_t*, unsigned);
int (*ok)(unsigned);
} sorts[] = {
{"bitonic", bitonic_serial, bitonic_ok},
{"insertion", insertion, insertion_ok},
{"sort networks", sort_networks, sort_networks_ok},
{"libc quicksort", quicksort, NULL},
{"qsort_ssup", test_qsort_ssup, NULL},
};
#define NSORTS (sizeof(sorts)/sizeof(*sorts))
static void time_several_sorts(unsigned N, unsigned M) {
struct timeval tv;
gettimeofday(&tv, NULL);
unsigned seed = getpid() ^ tv.tv_sec ^ tv.tv_usec;
srand(seed);
seed = rand();
for (unsigned i=0; i<NSORTS; i++) {
if (!sorts[i].ok || sorts[i].ok(N))
time_sort(sorts[i].label, seed, N, M, sorts[i].func);
}
}
int main (int argc, char *argv[], char *envp[]) {
unsigned N;
if (argc > 1) {
N = atoi(argv[1]);
} else {
N = 16384;
}
time_several_sorts(N, 1<<23);
}
/*
* autogenerated by src/backend/utils/sort/gen_qsort_tuple.pl, do not edit!
*
* This file is included by tuplesort.c, rather than compiled separately.
*/
/* $NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $ */
/*-
* Copyright (c) 1992, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the University nor the names of its contributors
* may be used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/*
* Qsort routine based on J. L. Bentley and M. D. McIlroy,
* "Engineering a sort function",
* Software--Practice and Experience 23 (1993) 1249-1265.
*
* We have modified their original by adding a check for already-sorted input,
* which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
*
* Also, we recurse on the smaller partition and iterate on the larger one,
* which ensures we cannot recurse more than log(N) levels (since the
* partition recursed to is surely no more than half of the input). Bentley
* and McIlroy explicitly rejected doing this on the grounds that it's "not
* worth the effort", but we have seen crashes in the field due to stack
* overrun, so that judgment seems wrong.
*/
static void
swapfunc(SortTuple *a, SortTuple *b, size_t n)
{
do
{
SortTuple t = *a;
*a++ = *b;
*b++ = t;
} while (--n > 0);
}
#define swap(a, b) \
do { \
SortTuple t = *(a); \
*(a) = *(b); \
*(b) = t; \
} while (0);
#define vecswap(a, b, n) if ((n) > 0) swapfunc(a, b, n)
static SortTuple *
med3_tuple(SortTuple *a, SortTuple *b, SortTuple *c, SortTupleComparator cmp_tuple, Tuplesortstate *state)
{
return cmp_tuple(a, b, state) < 0 ?
(cmp_tuple(b, c, state) < 0 ? b :
(cmp_tuple(a, c, state) < 0 ? c : a))
: (cmp_tuple(b, c, state) > 0 ? b :
(cmp_tuple(a, c, state) < 0 ? a : c));
}
static void
qsort_tuple(SortTuple *a, size_t n, SortTupleComparator cmp_tuple, Tuplesortstate *state)
{
SortTuple *pa,
*pb,
*pc,
*pd,
*pl,
*pm,
*pn;
size_t d1,
d2;
int r,
presorted;
loop:
CHECK_FOR_INTERRUPTS();
if (n <= 8) {
#define swapif(l,r) if (cmp_tuple(a+l,a+r, state) > 0) swap(a+l,a+r)
switch(n) {
case 0:
case 1:
break;
case 2:
swapif(0,1);
break;
case 3:
swapif(0,1); swapif(1,2);
swapif(0,1);
break;
case 4:
swapif(0,1); swapif(2,3);
swapif(1,3); swapif(0,2);
swapif(1,2);
break;
case 5:
swapif(1,2); swapif(3,4);
swapif(1,3); swapif(0,2);
swapif(2,4); swapif(0,3);
swapif(0,1); swapif(2,3);
swapif(1,2);
break;
case 6:
swapif(0,1); swapif(2,3); swapif(4,5);
swapif(0,2); swapif(3,5); swapif(1,4);
swapif(0,1); swapif(2,3); swapif(4,5);
swapif(1,2); swapif(3,4);
swapif(2,3);
break;
case 7:
swapif(1,2); swapif(3,4); swapif(5,6);
swapif(0,2); swapif(4,6); swapif(3,5);
swapif(2,6); swapif(1,5); swapif(0,4);
swapif(2,5); swapif(0,3);
swapif(2,4); swapif(1,3);
swapif(0,1); swapif(2,3); swapif(4,5);
break;
case 8:
swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7);
swapif(0, 3); swapif(1, 2); swapif(4, 7); swapif(5, 6);
swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7);
swapif(0, 7); swapif(1, 6); swapif(2, 5); swapif(3, 4);
swapif(1, 3); swapif(0, 2); swapif(5, 7); swapif(4, 6);
swapif(0, 1); swapif(2, 3); swapif(4, 5); swapif(6, 7);
break;
}
return;
#undef swapif
}
presorted = 1;
for (pm = a + 1; pm < a + n; pm++)
{
CHECK_FOR_INTERRUPTS();
if (cmp_tuple(pm - 1, pm, state) > 0)
{
presorted = 0;
break;
}
}
if (presorted)
return;
pm = a + (n / 2);
if (n > 7)
{
pl = a;
pn = a + (n - 1);
if (n > 40)
{
size_t d = (n / 8);
pl = med3_tuple(pl, pl + d, pl + 2 * d, cmp_tuple, state);
pm = med3_tuple(pm - d, pm, pm + d, cmp_tuple, state);
pn = med3_tuple(pn - 2 * d, pn - d, pn, cmp_tuple, state);
}
pm = med3_tuple(pl, pm, pn, cmp_tuple, state);
}
swap(a, pm);
pa = pb = a + 1;
pc = pd = a + (n - 1);
for (;;)
{
while (pb <= pc && (r = cmp_tuple(pb, a, state)) <= 0)
{
if (r == 0)
{
swap(pa, pb);
pa++;
}
pb++;
CHECK_FOR_INTERRUPTS();
}
while (pb <= pc && (r = cmp_tuple(pc, a, state)) >= 0)
{
if (r == 0)
{
swap(pc, pd);
pd--;
}
pc--;
CHECK_FOR_INTERRUPTS();
}
if (pb > pc)
break;
swap(pb, pc);
pb++;
pc--;
}
pn = a + n;
d1 = Min(pa - a, pb - pa);
vecswap(a, pb - d1, d1);
d1 = Min(pd - pc, pn - pd - 1);
vecswap(pb, pn - d1, d1);
d1 = pb - pa;
d2 = pd - pc;
if (d1 <= d2)
{
/* Recurse on left partition, then iterate on right partition */
if (d1 > 1)
qsort_tuple(a, d1, cmp_tuple, state);
if (d2 > 1)
{
/* Iterate rather than recurse to save stack space */
/* qsort_tuple(pn - d2, d2, cmp_tuple, state); */
a = pn - d2;
n = d2;
goto loop;
}
}
else
{
/* Recurse on right partition, then iterate on left partition */
if (d2 > 1)
qsort_tuple(pn - d2, d2, cmp_tuple, state);
if (d1 > 1)
{
/* Iterate rather than recurse to save stack space */
/* qsort_tuple(a, d1, cmp_tuple, state); */
n = d1;
goto loop;
}
}
}
#define cmp_ssup(a, b, ssup) \
ApplySortComparator((a)->datum1, (a)->isnull1, \
(b)->datum1, (b)->isnull1, ssup)
static SortTuple *
med3_ssup(SortTuple *a, SortTuple *b, SortTuple *c, SortSupport ssup)
{
return cmp_ssup(a, b, ssup) < 0 ?
(cmp_ssup(b, c, ssup) < 0 ? b :
(cmp_ssup(a, c, ssup) < 0 ? c : a))
: (cmp_ssup(b, c, ssup) > 0 ? b :
(cmp_ssup(a, c, ssup) < 0 ? a : c));
}
static void
qsort_ssup(SortTuple *a, size_t n, SortSupport ssup)
{
SortTuple *pa,
*pb,
*pc,
*pd,
*pl,
*pm,
*pn;
size_t d1,
d2;
int r;
loop:
CHECK_FOR_INTERRUPTS();
if (n < 7)
{
for (pm = a + 1; pm < a + n; pm++)
for (pl = pm; pl > a && cmp_ssup(pl - 1, pl, ssup) > 0; pl--)
swap(pl, pl - 1);
return;
}
#ifndef NOPRESORT
int presorted = 1;
for (pm = a + 1; pm < a + n; pm++)
{
CHECK_FOR_INTERRUPTS();
if (cmp_ssup(pm - 1, pm, ssup) > 0)
{
presorted = 0;
break;
}
}
if (presorted)
return;
#endif
pm = a + (n / 2);
if (n > 7)
{
pl = a;
pn = a + (n - 1);
if (n > 40)
{
size_t d = (n / 8);
pl = med3_ssup(pl, pl + d, pl + 2 * d, ssup);
pm = med3_ssup(pm - d, pm, pm + d, ssup);
pn = med3_ssup(pn - 2 * d, pn - d, pn, ssup);
}
pm = med3_ssup(pl, pm, pn, ssup);
}
swap(a, pm);
pa = pb = a + 1;
pc = pd = a + (n - 1);
for (;;)
{
while (pb <= pc && (r = cmp_ssup(pb, a, ssup)) <= 0)
{
if (r == 0)
{
swap(pa, pb);
pa++;
}
pb++;
CHECK_FOR_INTERRUPTS();
}
while (pb <= pc && (r = cmp_ssup(pc, a, ssup)) >= 0)
{
if (r == 0)
{
swap(pc, pd);
pd--;
}
pc--;
CHECK_FOR_INTERRUPTS();
}
if (pb > pc)
break;
swap(pb, pc);
pb++;
pc--;
}
pn = a + n;
d1 = Min(pa - a, pb - pa);
vecswap(a, pb - d1, d1);
d1 = Min(pd - pc, pn - pd - 1);
vecswap(pb, pn - d1, d1);
d1 = pb - pa;
d2 = pd - pc;
if (d1 <= d2)
{
/* Recurse on left partition, then iterate on right partition */
if (d1 > 1)
qsort_ssup(a, d1, ssup);
if (d2 > 1)
{
/* Iterate rather than recurse to save stack space */
/* qsort_ssup(pn - d2, d2, ssup); */
a = pn - d2;
n = d2;
goto loop;
}
}
else
{
/* Recurse on right partition, then iterate on left partition */
if (d2 > 1)
qsort_ssup(pn - d2, d2, ssup);
if (d1 > 1)
{
/* Iterate rather than recurse to save stack space */
/* qsort_ssup(a, d1, ssup); */
n = d1;
goto loop;
}
}
}
--
Sent via pgsql-hackers mailing list ([email protected])
To make changes to your subscription:
http://www.postgresql.org/mailpref/pgsql-hackers