When loading a module, module_frob_arch_sections() tries to figure out
the number of PLTs that'll be needed to handle all the RELAs. While
doing this, it tries to dedupe PLT allocations for multiple
R_AARCH64_CALL26 relocations to the same symbol. It does the same for
R_AARCH64_JUMP26 relocations.

To make checks for duplicates easier/faster, it sorts the relocation
list by type, symbol and addend. That way, to check for a duplicate
relocation, it just needs to compare with the previous entry.

However, sorting the entire relocation array is unnecessary and
expensive (O(n log n)) because there are a lot of other relocation types
that don't need deduping or can't be deduped.

So this commit partitions the array into entries that need deduping and
those that don't. And then sorts just the part that needs deduping. And
when CONFIG_RANDOMIZE_BASE is disabled, the sorting is skipped entirely
because PLTs are not allocated for R_AARCH64_CALL26 and R_AARCH64_JUMP26
if it's disabled.

This gives significant reduction in module load time for modules with
large number of relocations with no measurable impact on modules with a
small number of relocations. In my test setup with CONFIG_RANDOMIZE_BASE
enabled, these were the results for a few downstream modules:

Module          Size (MB)
wlan            14
video codec     3.8
drm             1.8
IPA             2.5
audio           1.2
gpu             1.8

Without this patch:
Module          Number of entries sorted        Module load time (ms)
wlan            243739                          283
video codec     74029                           138
drm             53837                           67
IPA             42800                           90
audio           21326                           27
gpu             20967                           32

Total time to load all these module: 637 ms

With this patch:
Module          Number of entries sorted        Module load time (ms)
wlan            22454                           61
video codec     10150                           47
drm             13014                           40
IPA             8097                            63
audio           4606                            16
gpu             6527                            20

Total time to load all these modules: 247

Time saved during boot for just these 6 modules: 390 ms

Cc: Ard Biesheuvel <[email protected]>
Signed-off-by: Saravana Kannan <[email protected]>
---

v1 -> v2:
- Provided more details in the commit text
- Pulled in Will's comments on the coding style
- Pulled in Ard's suggestion about skipping jumps with the same section
  index (parts of Will's suggested code)

 arch/arm64/kernel/module-plts.c | 46 ++++++++++++++++++++++++++++++---
 1 file changed, 43 insertions(+), 3 deletions(-)

diff --git a/arch/arm64/kernel/module-plts.c b/arch/arm64/kernel/module-plts.c
index 65b08a74aec6..0ce3a28e3347 100644
--- a/arch/arm64/kernel/module-plts.c
+++ b/arch/arm64/kernel/module-plts.c
@@ -253,6 +253,40 @@ static unsigned int count_plts(Elf64_Sym *syms, Elf64_Rela 
*rela, int num,
        return ret;
 }
 
+static bool branch_rela_needs_plt(Elf64_Sym *syms, Elf64_Rela *rela,
+                                 Elf64_Word dstidx)
+{
+
+       Elf64_Sym *s = syms + ELF64_R_SYM(rela->r_info);
+
+       if (s->st_shndx == dstidx)
+               return false;
+
+       return ELF64_R_TYPE(rela->r_info) == R_AARCH64_JUMP26 ||
+              ELF64_R_TYPE(rela->r_info) == R_AARCH64_CALL26;
+}
+
+/* Group branch PLT relas at the front end of the array. */
+static int partition_branch_plt_relas(Elf64_Sym *syms, Elf64_Rela *rela,
+                                     int numrels, Elf64_Word dstidx)
+{
+       int i = 0, j = numrels - 1;
+
+       if (!IS_ENABLED(CONFIG_RANDOMIZE_BASE))
+               return 0;
+
+       while (i < j) {
+               if (branch_rela_needs_plt(syms, &rela[i], dstidx))
+                       i++;
+               else if (branch_rela_needs_plt(syms, &rela[j], dstidx))
+                       swap(rela[i], rela[j]);
+               else
+                       j--;
+       }
+
+       return i;
+}
+
 int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs,
                              char *secstrings, struct module *mod)
 {
@@ -290,7 +324,7 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr 
*sechdrs,
 
        for (i = 0; i < ehdr->e_shnum; i++) {
                Elf64_Rela *rels = (void *)ehdr + sechdrs[i].sh_offset;
-               int numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
+               int nents, numrels = sechdrs[i].sh_size / sizeof(Elf64_Rela);
                Elf64_Shdr *dstsec = sechdrs + sechdrs[i].sh_info;
 
                if (sechdrs[i].sh_type != SHT_RELA)
@@ -300,8 +334,14 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr 
*sechdrs,
                if (!(dstsec->sh_flags & SHF_EXECINSTR))
                        continue;
 
-               /* sort by type, symbol index and addend */
-               sort(rels, numrels, sizeof(Elf64_Rela), cmp_rela, NULL);
+               /*
+                * sort branch relocations requiring a PLT by type, symbol index
+                * and addend
+                */
+               nents = partition_branch_plt_relas(syms, rels, numrels,
+                                                  sechdrs[i].sh_info);
+               if (nents)
+                       sort(rels, nents, sizeof(Elf64_Rela), cmp_rela, NULL);
 
                if (!str_has_prefix(secstrings + dstsec->sh_name, ".init"))
                        core_plts += count_plts(syms, rels, numrels,
-- 
2.27.0.111.gc72c7da667-goog

Reply via email to