MonetDB: default - Implemented radix sort, use it instead of qui...

Sjoerd Mullender via checkin-list Mon, 23 Sep 2024 04:24:17 -0700

Changeset: 3b0d3db0469c for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3b0d3db0469c
Added Files:
        gdk/gdk_rsort.c
Modified Files:
        gdk/CMakeLists.txt
        gdk/gdk_batop.c
        gdk/gdk_private.h
        sql/test/VOC/Tests/VOCquery.test
Branch: default
Log Message:


Implemented radix sort, use it instead of quick sort for larger arrays.


diffs (253 lines):

diff --git a/gdk/CMakeLists.txt b/gdk/CMakeLists.txt
--- a/gdk/CMakeLists.txt
+++ b/gdk/CMakeLists.txt
@@ -64,6 +64,7 @@ target_sources(bat
   gdk_string.c
   gdk_qsort.c
   gdk_qsort_impl.h
+  gdk_rsort.c
   gdk_storage.c
   gdk_bat.c
   gdk_delta.c gdk_delta.h
diff --git a/gdk/gdk_batop.c b/gdk/gdk_batop.c
--- a/gdk/gdk_batop.c
+++ b/gdk/gdk_batop.c
@@ -2269,6 +2269,19 @@ do_sort(void *restrict h, void *restrict
 {
        if (n <= 1)             /* trivially sorted */
                return GDK_SUCCEED;
+       switch (tpe) {
+       case TYPE_bte:
+       case TYPE_sht:
+       case TYPE_int:
+       case TYPE_lng:
+#ifdef HAVE_HGE
+       case TYPE_hge:
+#endif
+               assert(base == NULL);
+               if (nilslast == reverse && n > 100)
+                       return GDKrsort(h, t, n, hs, ts, reverse);
+               break;
+       }
        if (stable) {
                if (reverse)
                        return GDKssort_rev(h, t, base, n, hs, ts, tpe);
diff --git a/gdk/gdk_private.h b/gdk/gdk_private.h
--- a/gdk/gdk_private.h
+++ b/gdk/gdk_private.h
@@ -151,6 +151,9 @@ gdk_return GDKremovedir(int farmid, cons
 gdk_return GDKsave(int farmid, const char *nme, const char *ext, void *buf, 
size_t size, storage_t mode, bool dosync)
        __attribute__((__warn_unused_result__))
        __attribute__((__visibility__("hidden")));
+gdk_return GDKrsort(void *restrict h, void *restrict t, size_t n, size_t hs, 
size_t ts, bool reverse)
+       __attribute__((__warn_unused_result__))
+       __attribute__((__visibility__("hidden")));
 gdk_return GDKssort_rev(void *restrict h, void *restrict t, const void 
*restrict base, size_t n, int hs, int ts, int tpe)
        __attribute__((__warn_unused_result__))
        __attribute__((__visibility__("hidden")));
diff --git a/gdk/gdk_rsort.c b/gdk/gdk_rsort.c
new file mode 100644
--- /dev/null
+++ b/gdk/gdk_rsort.c
@@ -0,0 +1,168 @@
+/*
+ * SPDX-License-Identifier: MPL-2.0
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 2024 MonetDB Foundation;
+ * Copyright August 2008 - 2023 MonetDB B.V.;
+ * Copyright 1997 - July 2008 CWI.
+ */
+
+#include "monetdb_config.h"
+#include "gdk.h"
+#include "gdk_private.h"
+
+#define RADIX 8                        /* one char at a time */
+#define NBUCKETS (1 << RADIX)
+
+gdk_return
+GDKrsort(void *restrict h, void *restrict t, size_t n, size_t hs, size_t ts, 
bool reverse)
+{
+       const size_t niters = (8 * hs + RADIX - 1) / RADIX;
+       size_t *counts = GDKmalloc(niters * NBUCKETS * sizeof(size_t));
+       size_t pos[NBUCKETS];
+       uint8_t *h1 = h;
+       uint8_t *h2;
+       uint8_t *t1 = NULL;
+       uint8_t *t2 = NULL;
+       Heap tmph, tmpt;
+
+       if (counts == NULL)
+               return GDK_FAIL;
+
+       tmph = tmpt = (Heap) {
+               .farmid = 1,
+       };
+
+       snprintf(tmph.filename, sizeof(tmph.filename), "%s%crsort%zuh",
+                TEMPDIR_NAME, DIR_SEP, (size_t) MT_getpid());
+       if (HEAPalloc(&tmph, n, hs) != GDK_SUCCEED) {
+               GDKfree(counts);
+               return GDK_FAIL;
+       }
+       h2 = (uint8_t *) tmph.base;
+
+       if (t) {
+               snprintf(tmpt.filename, sizeof(tmpt.filename), "%s%crsort%zut",
+                        TEMPDIR_NAME, DIR_SEP, (size_t) MT_getpid());
+               if (HEAPalloc(&tmpt, n, ts) != GDK_SUCCEED) {
+                       GDKfree(counts);
+                       HEAPfree(&tmph, true);
+                       return GDK_FAIL;
+               }
+               t1 = t;
+               t2 = (uint8_t *) tmpt.base;
+       } else {
+               ts = 0;
+       }
+
+       memset(counts, 0, niters * NBUCKETS * sizeof(size_t));
+       memset(pos, 0, sizeof(pos));
+       for (size_t i = 0, o = 0; i < n; i++, o += hs) {
+               for (size_t j = 0; j < niters; j++) {
+#ifdef WORDS_BIGENDIAN
+                       uint8_t v = h1[o + hs - j - 1];
+#else
+                       uint8_t v = h1[o + j];
+#endif
+                       counts[(j << RADIX) + v]++;
+               }
+       }
+
+       /* When sorting in ascending order, the negative numbers occupy
+        * the second half of the buckets in the last iteration; when
+        * sorting in descending order, the negative numbers occupy the
+        * first half.  In either case, at the end we need to put the
+        * second half first and the first half after. */
+       const size_t neg = NBUCKETS / 2;
+       for (size_t j = 0, b = 0; j < niters; j++, b += NBUCKETS) {
+               size_t nb = counts[b] > 0;
+               if (reverse) {
+                       pos[NBUCKETS - 1] = 0;
+                       for (size_t i = NBUCKETS - 1; i > 0; i--) {
+                               pos[i - 1] = pos[i] + counts[b + i];
+                               nb += counts[b + i] > 0;
+                       }
+               } else {
+                       pos[0] = 0;
+                       for (size_t i = 1; i < NBUCKETS; i++) {
+                               pos[i] = pos[i - 1] + counts[b + i - 1];
+                               nb += counts[b + i] > 0;
+                       }
+               }
+               if (nb == 1) {
+                       /* no need to reshuffle data for this iteration:
+                        * everything is in the same bucket */
+                       if (j == niters - 1) {
+                               /* this was the last iteration; we're
+                                * not copying data and updating the pos
+                                * array, so we just update the one
+                                * value we need after the loop */
+                               if (reverse)
+                                       pos[neg] = pos[neg - 1];
+                               else
+                                       pos[neg - 1] = pos[neg];
+                               /* may as well skip the test at the top
+                                * of the loop */
+                               break;
+                       }
+                       continue;
+               }
+               /* note, this loop changes the pos array */
+               for (size_t i = 0, ho = 0, to = 0; i < n; i++, ho += hs, to += 
ts) {
+#ifdef WORDS_BIGENDIAN
+                       uint8_t v = h1[ho + hs - j - 1];
+#else
+                       uint8_t v = h1[ho + j];
+#endif
+                       if (t)
+                               memcpy(t2 + ts * pos[v], t1 + to, ts);
+                       memcpy(h2 + hs * pos[v]++, h1 + ho, hs);
+               }
+               uint8_t *t = h1;
+               h1 = h2;
+               h2 = t;
+               t = t1;
+               t1 = t2;
+               t2 = t;
+       }
+       GDKfree(counts);
+       const size_t negpos = pos[reverse ? neg : neg - 1];
+
+       if (h1 != (uint8_t *) h) {
+               /* copy the negative integers to the start, copy positive after 
*/
+               if (negpos < n) {
+                       memcpy(h2, h1 + hs * negpos, (n - negpos) * hs);
+                       if (t)
+                               memcpy(t2, t1 + ts * negpos, (n - negpos) * ts);
+               }
+               if (negpos > 0) {
+                       memcpy(h2 + hs * (n - negpos), h1, negpos * hs);
+                       if (t)
+                               memcpy(t2 + ts * (n - negpos), t1, negpos * ts);
+               }
+       } else if (negpos == 0 || negpos == n) {
+               /* no negative or no positive integers: they're all at the 
right place */
+       } else {
+               /* copy the negative integers to the start, copy positive after 
*/
+               if (negpos < n) {
+                       memcpy(h2, h1 + hs * negpos, (n - negpos) * hs);
+                       if (t)
+                               memcpy(t2, t1 + ts * negpos, (n - negpos) * ts);
+               }
+               if (negpos > 0) {
+                       memcpy(h2 + hs * (n - negpos), h1, negpos * hs);
+                       if (t)
+                               memcpy(t2 + ts * (n - negpos), t1, negpos * ts);
+               }
+               memcpy(h, h2, n * hs);
+               if (t)
+                       memcpy(t, t2, n * ts);
+       }
+       HEAPfree(&tmph, true);
+       if (t)
+               HEAPfree(&tmpt, true);
+       return GDK_SUCCEED;
+}
diff --git a/sql/test/VOC/Tests/VOCquery.test b/sql/test/VOC/Tests/VOCquery.test
--- a/sql/test/VOC/Tests/VOCquery.test
+++ b/sql/test/VOC/Tests/VOCquery.test
@@ -36,7 +36,7 @@ select count(*) from craftsmen c where e
 ----
 2349
 
-query IIITTIRRRR nosort
+query IIITTIRRRR rowsort
 SELECT number, trip, tonnage, departure_Date, arrival_date,
 RANK() OVER ( PARTITION BY trip ORDER BY tonnage ) AS RankAggregation,
 CUME_DIST() OVER ( PARTITION BY trip ORDER BY tonnage nulls first ) as 
CumeDistGroup1,
@@ -45,9 +45,9 @@ PERCENT_RANK() OVER ( PARTITION BY trip 
 PERCENT_RANK() OVER ( PARTITION BY trip ORDER BY tonnage nulls last ) as 
PercentRankGroup2
 FROM voyages WHERE particulars LIKE '%_recked%'
 ----
-3580 values hashing to 1dae27877c63f429ab14924a7c9250ba
+3580 values hashing to 0fb84c30b86984999eb17deaf134a592
 
-query RRRR nosort
+query RRRR rowsort
 SELECT
 CUME_DIST() OVER ( ORDER BY tonnage nulls first ) as CumeDistGroup1,
 CUME_DIST() OVER ( ORDER BY tonnage nulls last ) as CumeDistGroup2,
@@ -55,7 +55,7 @@ PERCENT_RANK() OVER ( ORDER BY tonnage n
 PERCENT_RANK() OVER ( ORDER BY tonnage nulls last ) as PercentRankGroup2
 FROM voyages WHERE particulars LIKE '%_recked%'
 ----
-1432 values hashing to edc9fc94dae411fcb58f92f145a72e04
+1432 values hashing to e6c06ab2f2514389b6ad904d76571427
 
 query RR rowsort
 SELECT
_______________________________________________
checkin-list mailing list -- checkin-list@monetdb.org
To unsubscribe send an email to checkin-list-le...@monetdb.org

MonetDB: default - Implemented radix sort, use it instead of qui...

Reply via email to