https://bugs.llvm.org/show_bug.cgi?id=50026
I reported it to the llvm people. it is two slightly different quicksort algorithms which perform radically differently. The one which you could assume would take more time, performs MUCH better. I made a custom quicksort algorithm which outperforms qsort by A LOT for sorting an array of around 300 randomly created unsigned characters, which is what I use it for. if you read the report a guy said there's a 10% difference for sorting 3 million characters on freebsd, but there's about 40% performance difference on OpenBSD. maybe it's also how the OpenBSD team modified clang to prevent rop chain stuff or something? I'm using a westmere based intell server. it's the same for clang 11. What's the deal? -Luke
/* clang sort_test2.c -O2 -o sort_test && ./sort_test */ /* * quicksort should be slightly faster than quicksort1 because it is * identical except quicksort1 does more work using global counters. * quicsort1 runs 41% faster. Using gcc, quicksort is instead slightly faster. */ /* * I'm using clang 10.0.1 on OpenBSD. these are my results: quicksort time = 0.004501771 quicksort1 time = 0.002655233 */ #include <err.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <unistd.h> #include <sys/param.h> size_t m; size_t g; size_t i; /* * quicksort sorts characters on a u_char array. * 'a' is the first character on the array. * 'b' is last character on the array */ static void quicksort(u_char a[], u_char b[]) { u_char temp; if (b - a <= 1) { if (a == b) return; if (*a > *b) { temp = *a; *a = *b; *b = temp; } return; } u_char * j = a + 1; u_char * k = b; u_char u = *a; /* * This if() avoids repeated awkward special case branching * in the similar if() in the "while (++j < k)" loop */ if (*j > u) { do { if (*k < u) { temp = *j; *j = *k; *k = temp; goto skip0; } } while (j < --k); quicksort(j, b); return; skip0: if (j == --k) { *a = *j; *j = u; quicksort(j+1, b); return; } } while (++j < k) { if (*j > u) { do { if (*k < u) { temp = *j; *j = *k; *k = temp; goto skip; } } while (j < --k); quicksort(j, b); *a = *--j; *j = u; quicksort(a, j-1); return; skip: if (j == --k) { *a = *j; *j = u; quicksort(a, j-1); quicksort(j+1, b); return; } } } if (*j > u) { quicksort(j, b); *a = *--j; *j = u; quicksort(a, j-1); } else { *a = *j; *j = u; quicksort(a, j-1); if (j == b) return; quicksort(j+1, b); } } /* * quicksort1 is identical to quicksort() but it alters some global counters * it should take slightly longer to run, but it runs in half the time */ static void quicksort1(u_char a[], u_char b[], size_t level) { ++m; if (level > i) i = level; u_char temp; if (b - a <= 1) { ++g; if (a == b) return; if (*a > *b) { temp = *a; *a = *b; *b = temp; } return; } u_char * j = a + 1; u_char * k = b; u_char u = *a; /* * This if() avoids repeated awkward special case branching * in the similar if() in the "while (++j < k)" loop */ if (*j > u) { do { if (*k < u) { temp = *j; *j = *k; *k = temp; goto skip0; } } while (j < --k); quicksort1(j, b, level + 1); return; skip0: if (j == --k) { *a = *j; *j = u; quicksort1(j+1, b, level + 1); return; } } while (++j < k) { if (*j > u) { do { if (*k < u) { temp = *j; *j = *k; *k = temp; goto skip; } } while (j < --k); quicksort1(j, b, level + 1); *a = *--j; *j = u; quicksort1(a, j-1, level + 1); return; skip: if (j == --k) { *a = *j; *j = u; quicksort1(a, j-1, level + 1); quicksort1(j+1, b, level + 1); return; } } } if (*j > u) { quicksort1(j, b, level + 1); *a = *--j; *j = u; quicksort1(a, j-1, level + 1); } else { *a = *j; *j = u; quicksort1(a, j-1, level + 1); if (j == b) return; quicksort1(j+1, b, level + 1); } } int main() { long double cpu_time_used; struct timespec start, end; u_char *buffer, *buffer2; size_t y; size_t n = 30000; buffer = (u_char*)malloc(n); if (buffer == NULL) err(1, "malloc"); buffer2 = (u_char*)malloc(n); if (buffer2 == NULL) err(1, "malloc"); for (y = 0; y < n; ++y) buffer[y] = rand() % 256; // run it once to clear any wrappers clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); memcpy(buffer2, buffer, n); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); quicksort(buffer2, buffer2 + n - 1); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); cpu_time_used = (long double) (end.tv_sec - start.tv_sec) + (long double) (end.tv_nsec - start.tv_nsec) / (long double) 1000000000; printf("\n\nquicksort\n"); printf("time = %.9Lf\n\n", cpu_time_used); memcpy(buffer2, buffer, n); i = m = g = 0; clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start); quicksort1(buffer2, buffer2 + n - 1, 1); clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end); cpu_time_used = (long double) (end.tv_sec - start.tv_sec) + (long double) (end.tv_nsec - start.tv_nsec) / (long double) 1000000000; printf("\n\nquicksort1\n"); //~ printf("time = %.9Lf, max depth: %lu, calls: %lu, g: %lu\n\n\n", cpu_time_used, i, m, g); printf("time = %.9Lf\n", cpu_time_used); return 0; }