Am 23.01.2017 um 20:07 schrieb Junio C Hamano:
> René Scharfe <l....@web.de> writes:
> 
>> Implement qsort_s() as a wrapper to the GNU version of qsort_r(1) and
>> use it on Linux.  Performance increases slightly:
>>
>> Test                         HEAD^             HEAD
>> --------------------------------------------------------------------
>> 0071.2: sort(1)              0.10(0.20+0.02)   0.10(0.21+0.01) +0.0%
>> 0071.3: string_list_sort()   0.17(0.15+0.01)   0.16(0.15+0.01) -5.9%
>>
>> Additionally the unstripped size of compat/qsort_s.o falls from 24576
>> to 16544 bytes in my build.
>>
>> IMHO these savings aren't worth the increased complexity of having to
>> support two implementations.
> 
> I do worry about having to support more implementations in the
> future that have different function signature for the comparison
> callbacks, which will make things ugly, but this addition alone
> doesn't look too bad to me.

It is unfair of me to show a 5% speedup and then recommend to not
include it. ;-)  That difference won't be measurable in real use cases
and the patch is not necessary.  This patch is simple, but the overall
complexity (incl. #ifdefs etc.) will be lower without it.

But here's another one, with even higher performance and with an even
bigger recommendation to not include it! :)  It veers off into another
direction: Parallel execution.  It requires thread-safe comparison
functions, which might surprise callers.  The value 1000 for the minimum
number of items before threading kicks in is just a guess, not based on
measurements.  So it's quite raw -- and I'm not sure why it's still a
bit slower than sort(1).

Test                         HEAD^             HEAD
---------------------------------------------------------------------
0071.2: sort(1)              0.10(0.18+0.03)   0.10(0.20+0.02) +0.0%
0071.3: string_list_sort()   0.17(0.14+0.02)   0.11(0.18+0.02) -35.3%

---
 compat/qsort_s.c | 76 ++++++++++++++++++++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/compat/qsort_s.c b/compat/qsort_s.c
index 52d1f0a73d..1304a089af 100644
--- a/compat/qsort_s.c
+++ b/compat/qsort_s.c
@@ -1,4 +1,5 @@
 #include "../git-compat-util.h"
+#include "../thread-utils.h"
 
 /*
  * A merge sort implementation, simplified from the qsort implementation
@@ -6,29 +7,58 @@
  * Added context pointer, safety checks and return value.
  */
 
-static void msort_with_tmp(void *b, size_t n, size_t s,
-                          int (*cmp)(const void *, const void *, void *),
-                          char *t, void *ctx)
+#define MIN_ITEMS_FOR_THREAD 1000
+
+struct work {
+       int nr_threads;
+       void *base;
+       size_t nmemb;
+       size_t size;
+       char *tmp;
+       int (*cmp)(const void *, const void *, void *);
+       void *ctx;
+};
+
+static void *msort_with_tmp(void *work)
 {
+       struct work one, two, *w = work;
        char *tmp;
        char *b1, *b2;
        size_t n1, n2;
+       size_t s, n;
 
-       if (n <= 1)
-               return;
+       if (w->nmemb <= 1)
+               return NULL;
 
-       n1 = n / 2;
-       n2 = n - n1;
-       b1 = b;
-       b2 = (char *)b + (n1 * s);
+       one = two = *w;
+       one.nr_threads /= 2;
+       two.nr_threads -= one.nr_threads;
+       n = one.nmemb;
+       s = one.size;
+       n1 = one.nmemb = n / 2;
+       n2 = two.nmemb = n - n1;
+       b1 = one.base;
+       b2 = two.base = b1 + n1 * s;
+       two.tmp += n1 * s;
 
-       msort_with_tmp(b1, n1, s, cmp, t, ctx);
-       msort_with_tmp(b2, n2, s, cmp, t, ctx);
+#ifndef NO_PTHREADS
+       if (one.nr_threads && n > MIN_ITEMS_FOR_THREAD) {
+               pthread_t thread;
+               int err = pthread_create(&thread, NULL, msort_with_tmp, &one);
+               msort_with_tmp(&two);
+               if (err || pthread_join(thread, NULL))
+                       msort_with_tmp(&one);
+       } else
+#endif
+       {
+               msort_with_tmp(&one);
+               msort_with_tmp(&two);
+       }
 
-       tmp = t;
+       tmp = one.tmp;
 
        while (n1 > 0 && n2 > 0) {
-               if (cmp(b1, b2, ctx) <= 0) {
+               if (one.cmp(b1, b2, one.ctx) <= 0) {
                        memcpy(tmp, b1, s);
                        tmp += s;
                        b1 += s;
@@ -42,7 +72,8 @@ static void msort_with_tmp(void *b, size_t n, size_t s,
        }
        if (n1 > 0)
                memcpy(tmp, b1, n1 * s);
-       memcpy(b, t, (n - n2) * s);
+       memcpy(one.base, one.tmp, (n - n2) * s);
+       return NULL;
 }
 
 int git_qsort_s(void *b, size_t n, size_t s,
@@ -50,20 +81,29 @@ int git_qsort_s(void *b, size_t n, size_t s,
 {
        const size_t size = st_mult(n, s);
        char buf[1024];
+       struct work w;
 
        if (!n)
                return 0;
        if (!b || !cmp)
                return -1;
 
+       w.nr_threads = online_cpus();
+       w.base = b;
+       w.nmemb = n;
+       w.size = s;
+       w.cmp = cmp;
+       w.ctx = ctx;
+
        if (size < sizeof(buf)) {
                /* The temporary array fits on the small on-stack buffer. */
-               msort_with_tmp(b, n, s, cmp, buf, ctx);
+               w.tmp = buf;
        } else {
                /* It's somewhat large, so malloc it.  */
-               char *tmp = xmalloc(size);
-               msort_with_tmp(b, n, s, cmp, tmp, ctx);
-               free(tmp);
+               w.tmp = xmalloc(size);
        }
+       msort_with_tmp(&w);
+       if (w.tmp != buf)
+               free(w.tmp);
        return 0;
 }
-- 
2.11.0

Reply via email to