Add X86_ENDBR to indirect branch targets.

Since adding X86_ENDBR puts the one-byte displacement of "jrcxz L(end)"
out of range, move L(end) closer to "jrcxz L(end)".
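(Background, not part of the patch itself: jrcxz has only a rel8
encoding, i.e. a one-byte signed displacement, so its target must lie
within -128..+127 bytes of the next instruction, and there is no rel32
form to fall back on. X86_ENDBR presumably expands to the 4-byte
endbr64 on CET-enabled builds, so adding it to each of the f-blocks
between the branch and L(end) can push the target out of reach. A
minimal sketch of the failure mode, in plain GAS syntax with
hypothetical labels rather than the GMP m4 dialect:

	jrcxz	.Lend		# rel8 only: .Lend must be within +127 bytes
	.rept	32		# 32 x 4-byte endbr64 = 128 bytes of padding
	endbr64
	.endr
.Lend:	ret			# displacement is now 128; the jrcxz no longer assembles

Since the branch cannot be widened, moving L(end) back within rel8
range is the natural fix.)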
* mpn/x86_64/coreibwl/mullo_basecase.asm: Add X86_ENDBR to indirect
branch targets. Move L(end) closer to "jrcxz L(end)".
---
 mpn/x86_64/coreibwl/mullo_basecase.asm | 58 ++++++++++++++++----------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/mpn/x86_64/coreibwl/mullo_basecase.asm b/mpn/x86_64/coreibwl/mullo_basecase.asm
index b3e435b35..d7f3bd55e 100644
--- a/mpn/x86_64/coreibwl/mullo_basecase.asm
+++ b/mpn/x86_64/coreibwl/mullo_basecase.asm
@@ -134,13 +134,15 @@ ifdef(`PIC',
 	jmp	*(%r10,%rax,8)
 ')
 
-L(mf0):	mulx(	(up), %r10, %r8)
+L(mf0):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	56(up), up
 	lea	-8(rp), rp
 	lea	L(f7)(%rip), jmpreg
 	jmp	L(mb0)
 
-L(mf3):	mulx(	(up), %r9, %rax)
+L(mf3):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	16(up), up
 	lea	16(rp), rp
 	jrcxz	L(mc)
@@ -157,38 +159,44 @@ L(mc):	mulx(	-8,(up), %r10, %r8)
 	mov	%r9, (rp)
 	jmp	L(c2)
 
-L(mf4):	mulx(	(up), %r10, %r8)
+L(mf4):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	24(up), up
 	lea	24(rp), rp
 	inc	R32(n)
 	lea	L(f3)(%rip), jmpreg
 	jmp	L(mb4)
 
-L(mf5):	mulx(	(up), %r9, %rax)
+L(mf5):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	32(up), up
 	lea	32(rp), rp
 	inc	R32(n)
 	lea	L(f4)(%rip), jmpreg
 	jmp	L(mb5)
 
-L(mf6):	mulx(	(up), %r10, %r8)
+L(mf6):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	40(up), up
 	lea	40(rp), rp
 	inc	R32(n)
 	lea	L(f5)(%rip), jmpreg
 	jmp	L(mb6)
 
-L(mf7):	mulx(	(up), %r9, %rax)
+L(mf7):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	48(up), up
 	lea	48(rp), rp
 	lea	L(f6)(%rip), jmpreg
 	jmp	L(mb7)
 
-L(mf1):	mulx(	(up), %r9, %rax)
+L(mf1):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	L(f0)(%rip), jmpreg
 	jmp	L(mb1)
 
-L(mf2):	mulx(	(up), %r10, %r8)
+L(mf2):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	8(up), up
 	lea	8(rp), rp
 	lea	L(f1)(%rip), jmpreg
@@ -235,17 +243,26 @@ L(mend):mov	%r10, -8(rp)
 	shr	$3, R32(nn)
 	jmp	L(ent)
 
-L(f0):	mulx(	(up), %r10, %r8)
+L(f0):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	-8(up), up
 	lea	-8(rp), rp
 	lea	L(f7)(%rip), jmpreg
 	jmp	L(b0)
 
-L(f1):	mulx(	(up), %r9, %rax)
+L(f1):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	-1(nn), R32(nn)
 	lea	L(f0)(%rip), jmpreg
 	jmp	L(b1)
 
+L(f7):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
+	lea	-16(up), up
+	lea	-16(rp), rp
+	lea	L(f6)(%rip), jmpreg
+	jmp	L(b7)
+
 L(end):	adox(	(rp), %r9)
 	mov	%r9, (rp)
 	adox(	%rcx, %rax)	C relies on rcx = 0
@@ -261,13 +278,8 @@ L(ent):	mulx(	8,(up), %r10, %r8)	C r8 unused (use imul?)
 	or	R32(nn), R32(n)		C copy count, clear CF,OF (n = 0 prior)
 	jmp	*jmpreg
 
-L(f7):	mulx(	(up), %r9, %rax)
-	lea	-16(up), up
-	lea	-16(rp), rp
-	lea	L(f6)(%rip), jmpreg
-	jmp	L(b7)
-
-L(f2):	mulx(	(up), %r10, %r8)
+L(f2):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	8(up), up
 	lea	8(rp), rp
 	mulx(	(up), %r9, %rax)
@@ -313,25 +325,29 @@ L(b3):	adox(	48,(rp), %r9)
 	mulx(	(up), %r9, %rax)
 	jmp	L(top)
 
-L(f6):	mulx(	(up), %r10, %r8)
+L(f6):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	40(up), up
 	lea	-24(rp), rp
 	lea	L(f5)(%rip), jmpreg
 	jmp	L(b6)
 
-L(f5):	mulx(	(up), %r9, %rax)
+L(f5):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	32(up), up
 	lea	-32(rp), rp
 	lea	L(f4)(%rip), jmpreg
 	jmp	L(b5)
 
-L(f4):	mulx(	(up), %r10, %r8)
+L(f4):	X86_ENDBR
+	mulx(	(up), %r10, %r8)
 	lea	24(up), up
 	lea	-40(rp), rp
 	lea	L(f3)(%rip), jmpreg
 	jmp	L(b4)
 
-L(f3):	mulx(	(up), %r9, %rax)
+L(f3):	X86_ENDBR
+	mulx(	(up), %r9, %rax)
 	lea	16(up), up
 	lea	-48(rp), rp
 	jrcxz	L(cor)
-- 
2.24.1