Changeset: 3b0d3db0469c for MonetDB URL: https://dev.monetdb.org/hg/MonetDB/rev/3b0d3db0469c Added Files: gdk/gdk_rsort.c Modified Files: gdk/CMakeLists.txt gdk/gdk_batop.c gdk/gdk_private.h sql/test/VOC/Tests/VOCquery.test Branch: default Log Message:
Implemented radix sort, use it instead of quick sort for larger arrays. diffs (253 lines): diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt --- a/gdk/CMakeLists.txt +++ b/gdk/CMakeLists.txt @@ -64,6 +64,7 @@ target_sources(bat gdk_string.c gdk_qsort.c gdk_qsort_impl.h + gdk_rsort.c gdk_storage.c gdk_bat.c gdk_delta.c gdk_delta.h diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c --- a/gdk/gdk_batop.c +++ b/gdk/gdk_batop.c @@ -2269,6 +2269,19 @@ do_sort(void *restrict h, void *restrict { if (n <= 1) /* trivially sorted */ return GDK_SUCCEED; + switch (tpe) { + case TYPE_bte: + case TYPE_sht: + case TYPE_int: + case TYPE_lng: +#ifdef HAVE_HGE + case TYPE_hge: +#endif + assert(base == NULL); + if (nilslast == reverse && n > 100) + return GDKrsort(h, t, n, hs, ts, reverse); + break; + } if (stable) { if (reverse) return GDKssort_rev(h, t, base, n, hs, ts, tpe); diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h --- a/gdk/gdk_private.h +++ b/gdk/gdk_private.h @@ -151,6 +151,9 @@ gdk_return GDKremovedir(int farmid, cons gdk_return GDKsave(int farmid, const char *nme, const char *ext, void *buf, size_t size, storage_t mode, bool dosync) __attribute__((__warn_unused_result__)) __attribute__((__visibility__("hidden"))); +gdk_return GDKrsort(void *restrict h, void *restrict t, size_t n, size_t hs, size_t ts, bool reverse) + __attribute__((__warn_unused_result__)) + __attribute__((__visibility__("hidden"))); gdk_return GDKssort_rev(void *restrict h, void *restrict t, const void *restrict base, size_t n, int hs, int ts, int tpe) __attribute__((__warn_unused_result__)) __attribute__((__visibility__("hidden"))); diff --git a/gdk/gdk_rsort.c b/gdk/gdk_rsort.c new file mode 100644 --- /dev/null +++ b/gdk/gdk_rsort.c @@ -0,0 +1,168 @@ +/* + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 2024 MonetDB Foundation; + * Copyright August 2008 - 2023 MonetDB B.V.; + * Copyright 1997 - July 2008 CWI. + */ + +#include "monetdb_config.h" +#include "gdk.h" +#include "gdk_private.h" + +#define RADIX 8 /* one char at a time */ +#define NBUCKETS (1 << RADIX) + +gdk_return +GDKrsort(void *restrict h, void *restrict t, size_t n, size_t hs, size_t ts, bool reverse) +{ + const size_t niters = (8 * hs + RADIX - 1) / RADIX; + size_t *counts = GDKmalloc(niters * NBUCKETS * sizeof(size_t)); + size_t pos[NBUCKETS]; + uint8_t *h1 = h; + uint8_t *h2; + uint8_t *t1 = NULL; + uint8_t *t2 = NULL; + Heap tmph, tmpt; + + if (counts == NULL) + return GDK_FAIL; + + tmph = tmpt = (Heap) { + .farmid = 1, + }; + + snprintf(tmph.filename, sizeof(tmph.filename), "%s%crsort%zuh", + TEMPDIR_NAME, DIR_SEP, (size_t) MT_getpid()); + if (HEAPalloc(&tmph, n, hs) != GDK_SUCCEED) { + GDKfree(counts); + return GDK_FAIL; + } + h2 = (uint8_t *) tmph.base; + + if (t) { + snprintf(tmpt.filename, sizeof(tmpt.filename), "%s%crsort%zut", + TEMPDIR_NAME, DIR_SEP, (size_t) MT_getpid()); + if (HEAPalloc(&tmpt, n, ts) != GDK_SUCCEED) { + GDKfree(counts); + HEAPfree(&tmph, true); + return GDK_FAIL; + } + t1 = t; + t2 = (uint8_t *) tmpt.base; + } else { + ts = 0; + } + + memset(counts, 0, niters * NBUCKETS * sizeof(size_t)); + memset(pos, 0, sizeof(pos)); + for (size_t i = 0, o = 0; i < n; i++, o += hs) { + for (size_t j = 0; j < niters; j++) { +#ifdef WORDS_BIGENDIAN + uint8_t v = h1[o + hs - j - 1]; +#else + uint8_t v = h1[o + j]; +#endif + counts[(j << RADIX) + v]++; + } + } + + /* When sorting in ascending order, the negative numbers occupy + * the second half of the buckets in the last iteration; when + * sorting in descending order, the negative numbers occupy the + * first half. In either case, at the end we need to put the + * second half first and the first half after. */ + const size_t neg = NBUCKETS / 2; + for (size_t j = 0, b = 0; j < niters; j++, b += NBUCKETS) { + size_t nb = counts[b] > 0; + if (reverse) { + pos[NBUCKETS - 1] = 0; + for (size_t i = NBUCKETS - 1; i > 0; i--) { + pos[i - 1] = pos[i] + counts[b + i]; + nb += counts[b + i] > 0; + } + } else { + pos[0] = 0; + for (size_t i = 1; i < NBUCKETS; i++) { + pos[i] = pos[i - 1] + counts[b + i - 1]; + nb += counts[b + i] > 0; + } + } + if (nb == 1) { + /* no need to reshuffle data for this iteration: + * everything is in the same bucket */ + if (j == niters - 1) { + /* this was the last iteration; we're + * not copying data and updating the pos + * array, so we just update the one + * value we need after the loop */ + if (reverse) + pos[neg] = pos[neg - 1]; + else + pos[neg - 1] = pos[neg]; + /* may as well skip the test at the top + * of the loop */ + break; + } + continue; + } + /* note, this loop changes the pos array */ + for (size_t i = 0, ho = 0, to = 0; i < n; i++, ho += hs, to += ts) { +#ifdef WORDS_BIGENDIAN + uint8_t v = h1[ho + hs - j - 1]; +#else + uint8_t v = h1[ho + j]; +#endif + if (t) + memcpy(t2 + ts * pos[v], t1 + to, ts); + memcpy(h2 + hs * pos[v]++, h1 + ho, hs); + } + uint8_t *t = h1; + h1 = h2; + h2 = t; + t = t1; + t1 = t2; + t2 = t; + } + GDKfree(counts); + const size_t negpos = pos[reverse ? neg : neg - 1]; + + if (h1 != (uint8_t *) h) { + /* copy the negative integers to the start, copy positive after */ + if (negpos < n) { + memcpy(h2, h1 + hs * negpos, (n - negpos) * hs); + if (t) + memcpy(t2, t1 + ts * negpos, (n - negpos) * ts); + } + if (negpos > 0) { + memcpy(h2 + hs * (n - negpos), h1, negpos * hs); + if (t) + memcpy(t2 + ts * (n - negpos), t1, negpos * ts); + } + } else if (negpos == 0 || negpos == n) { + /* no negative or no positive integers: they're all at the right place */ + } else { + /* copy the negative integers to the start, copy positive after */ + if (negpos < n) { + memcpy(h2, h1 + hs * negpos, (n - negpos) * hs); + if (t) + memcpy(t2, t1 + ts * negpos, (n - negpos) * ts); + } + if (negpos > 0) { + memcpy(h2 + hs * (n - negpos), h1, negpos * hs); + if (t) + memcpy(t2 + ts * (n - negpos), t1, negpos * ts); + } + memcpy(h, h2, n * hs); + if (t) + memcpy(t, t2, n * ts); + } + HEAPfree(&tmph, true); + if (t) + HEAPfree(&tmpt, true); + return GDK_SUCCEED; +} diff --git a/sql/test/VOC/Tests/VOCquery.test b/sql/test/VOC/Tests/VOCquery.test --- a/sql/test/VOC/Tests/VOCquery.test +++ b/sql/test/VOC/Tests/VOCquery.test @@ -36,7 +36,7 @@ select count(*) from craftsmen c where e ---- 2349 -query IIITTIRRRR nosort +query IIITTIRRRR rowsort SELECT number, trip, tonnage, departure_Date, arrival_date, RANK() OVER ( PARTITION BY trip ORDER BY tonnage ) AS RankAggregation, CUME_DIST() OVER ( PARTITION BY trip ORDER BY tonnage nulls first ) as CumeDistGroup1, @@ -45,9 +45,9 @@ PERCENT_RANK() OVER ( PARTITION BY trip PERCENT_RANK() OVER ( PARTITION BY trip ORDER BY tonnage nulls last ) as PercentRankGroup2 FROM voyages WHERE particulars LIKE '%_recked%' ---- -3580 values hashing to 1dae27877c63f429ab14924a7c9250ba +3580 values hashing to 0fb84c30b86984999eb17deaf134a592 -query RRRR nosort +query RRRR rowsort SELECT CUME_DIST() OVER ( ORDER BY tonnage nulls first ) as CumeDistGroup1, CUME_DIST() OVER ( ORDER BY tonnage nulls last ) as CumeDistGroup2, @@ -55,7 +55,7 @@ PERCENT_RANK() OVER ( ORDER BY tonnage n PERCENT_RANK() OVER ( ORDER BY tonnage nulls last ) as PercentRankGroup2 FROM voyages WHERE particulars LIKE '%_recked%' ---- -1432 values hashing to edc9fc94dae411fcb58f92f145a72e04 +1432 values hashing to e6c06ab2f2514389b6ad904d76571427 query RR rowsort SELECT _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org