Rather than u32_swap and u64_swap working on 4- and 8-byte objects
directly, let them handle any multiple of 4 or 8 bytes.  This speeds
up most users of sort() by avoiding fallback to the byte copy loop.

Despite what commit ca96ab859ab4 ("lib/sort: Add 64 bit swap function")
claims, very few users of sort() sort pointers (or pointer-sized
objects); most sort structures containing at least two words.
(E.g. drivers/acpi/fan.c:acpi_fan_get_fps() sorts an array of 40-byte
struct acpi_fan_fps.)
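As a rough illustration (the struct, field, and function names below are
hypothetical, loosely modeled on acpi_fan_fps), a caller like this now gets
the 8-byte swap loop rather than the byte copy fallback:

  #include <linux/sort.h>
  #include <linux/types.h>

  struct fps_entry {	/* hypothetical 40-byte, 8-byte-aligned element */
  	u64 control, trip_point, speed, noise_level, power;
  };

  static int fps_cmp(const void *a, const void *b)
  {
  	const struct fps_entry *x = a, *y = b;

  	return x->speed < y->speed ? -1 : x->speed > y->speed;
  }

  static void sort_fps(struct fps_entry *fps, size_t count)
  {
  	/* swap_func == NULL: sizeof(*fps) is a multiple of 8 and the
  	 * array is naturally 8-byte aligned, so sort() now selects
  	 * u64_swap instead of generic_swap.
  	 */
  	sort(fps, count, sizeof(*fps), fps_cmp, NULL);
  }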

x86-64 code size 872 -> 885 bytes (+13)

Signed-off-by: George Spelvin <l...@sdf.org>
---
 lib/sort.c | 117 +++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 96 insertions(+), 21 deletions(-)

diff --git a/lib/sort.c b/lib/sort.c
index d6b7a202b0b6..dff2ab2e196e 100644
--- a/lib/sort.c
+++ b/lib/sort.c
@@ -11,35 +11,110 @@
 #include <linux/export.h>
 #include <linux/sort.h>
 
-static int alignment_ok(const void *base, int align)
+/**
+ * alignment_ok - is this pointer & size okay for word-wide copying?
+ * @base: pointer to data
+ * @size: size of each element
+ * @align: required alignment (typically 4 or 8)
+ *
+ * Returns true if elements can be copied using word loads and stores.
+ * The size must be a multiple of the alignment, and the base address must
+ * be too, unless CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is enabled.
+ *
+ * For some reason, gcc doesn't know to optimize "if (a & mask || b & mask)"
+ * to "if ((a | b) & mask)", so we do that by hand.
+ */
+static bool __attribute_const__
+alignment_ok(const void *base, size_t size, unsigned int align)
 {
-       return IS_ENABLED(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) ||
-               ((unsigned long)base & (align - 1)) == 0;
+       unsigned int lsbits = (unsigned int)size;
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+       (void)base;
+#else
+       lsbits |= (unsigned int)(size_t)base;
+#endif
+       lsbits &= align - 1;
+       return lsbits == 0;
 }
 
+/**
+ * u32_swap - swap two elements in 32-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 4)
+ *
+ * Exchange the two objects in memory.  This exploits base+index addressing,
+ * which basically all CPUs have, to minimize loop overhead computations.
+ *
+ * For some reason, on x86 gcc 7.3.0 adds a redundant test of n at the
+ * bottom of the loop, even though the zero flag is still valid from the
+ * subtract (since the intervening mov instructions don't alter the flags).
+ * Gcc 8.1.0 doesn't have that problem.
+ */
 static void u32_swap(void *a, void *b, int size)
 {
-       u32 t = *(u32 *)a;
-       *(u32 *)a = *(u32 *)b;
-       *(u32 *)b = t;
+       size_t n = size;
+
+       do {
+               u32 t = *(u32 *)(a + (n -= 4));
+               *(u32 *)(a + n) = *(u32 *)(b + n);
+               *(u32 *)(b + n) = t;
+       } while (n);
 }
 
+/**
+ * u64_swap - swap two elements in 64-bit chunks
+ * @a, @b: pointers to the elements
+ * @size: element size (must be a multiple of 8)
+ *
+ * Exchange the two objects in memory.  This exploits base+index
+ * addressing, which basically all CPUs have, to minimize loop overhead
+ * computations.
+ *
+ * We'd like to use 64-bit loads if possible.  If they're not available,
+ * emulating one needs base+index+4 addressing, which x86 has but most other
+ * processors do not.  If CONFIG_64BIT, we definitely have 64-bit loads,
+ * but it's possible to have 64-bit loads without 64-bit pointers (e.g.
+ * x32 ABI).  Are there any cases the kernel needs to worry about?
+ */
+
 static void u64_swap(void *a, void *b, int size)
 {
-       u64 t = *(u64 *)a;
-       *(u64 *)a = *(u64 *)b;
-       *(u64 *)b = t;
-}
-
-static void generic_swap(void *a, void *b, int size)
-{
-       char t;
+       size_t n = size;
 
        do {
-               t = *(char *)a;
-               *(char *)a++ = *(char *)b;
-               *(char *)b++ = t;
-       } while (--size > 0);
+#ifdef CONFIG_64BIT
+               u64 t = *(u64 *)(a + (n -= 8));
+               *(u64 *)(a + n) = *(u64 *)(b + n);
+               *(u64 *)(b + n) = t;
+#else
+               /* Use two 32-bit transfers to avoid base+index+4 addressing */
+               u32 t = *(u32 *)(a + (n -= 4));
+               *(u32 *)(a + n) = *(u32 *)(b + n);
+               *(u32 *)(b + n) = t;
+
+               t = *(u32 *)(a + (n -= 4));
+               *(u32 *)(a + n) = *(u32 *)(b + n);
+               *(u32 *)(b + n) = t;
+#endif
+       } while (n);
+}
+
+/**
+ * generic_swap - swap two elements a byte at a time
+ * @a, @b: pointers to the elements
+ * @size: element size
+ *
+ * This is the fallback if alignment doesn't allow using larger chunks.
+ */
+static void generic_swap(void *a, void *b, int size)
+{
+       size_t n = size;
+
+       do {
+               char t = ((char *)a)[--n];
+               ((char *)a)[n] = ((char *)b)[n];
+               ((char *)b)[n] = t;
+       } while (n);
 }
 
 /**
@@ -67,10 +142,10 @@ void sort(void *base, size_t num, size_t size,
        int i = (num/2 - 1) * size, n = num * size, c, r;
 
        if (!swap_func) {
-               if (size == 4 && alignment_ok(base, 4))
-                       swap_func = u32_swap;
-               else if (size == 8 && alignment_ok(base, 8))
+               if (alignment_ok(base, size, 8))
                        swap_func = u64_swap;
+               else if (alignment_ok(base, size, 4))
+                       swap_func = u32_swap;
                else
                        swap_func = generic_swap;
        }
-- 
2.20.1
