On Tue, Jul 22, 2025 at 4:47 AM H.J. Lu <hjl.to...@gmail.com> wrote:
>
> For TLS calls:
>
> 1. UNSPEC_TLS_GD:
>
>   (parallel [
>     (set (reg:DI 0 ax)
>          (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
>                   (const_int 0 [0])))
>     (unspec:DI [(symbol_ref:DI ("e") [flags 0x50])
>                 (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
>     (clobber (reg:DI 5 di))])
>
> 2. UNSPEC_TLS_LD_BASE:
>
>   (parallel [
>     (set (reg:DI 0 ax)
>          (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
>                   (const_int 0 [0])))
>     (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
>
> 3. UNSPEC_TLSDESC:
>
>   (parallel [
>      (set (reg/f:DI 104)
>            (plus:DI (unspec:DI [
>                        (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
>                        (reg:DI 114)
>                        (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
>                     (const:DI (unspec:DI [
>                                  (symbol_ref:DI ("e") [flags 0x1a])
>                               ] UNSPEC_DTPOFF))))
>      (clobber (reg:CC 17 flags))])
>
>   (parallel [
>     (set (reg:DI 101)
>          (unspec:DI [(symbol_ref:DI ("e") [flags 0x50])
>                      (reg:DI 112)
>                      (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
>     (clobber (reg:CC 17 flags))])
>
> they return the same value for the same input value.  But multiple calls
> with the same input value may be generated for simple programs like:
>
> void a(long *);
> int b(void);
> void c(void);
> static __thread long e;
> long
> d(void)
> {
>   a(&e);
>   if (b())
>     c();
>   return e;
> }
>
> When compiled with -O2 -fPIC -mtls-dialect=gnu2, the following code is
> generated:
>
>         .type   d, @function
> d:
> .LFB0:
>         .cfi_startproc
>         pushq   %rbx
>         .cfi_def_cfa_offset 16
>         .cfi_offset 3, -16
>         leaq    e@TLSDESC(%rip), %rbx
>         movq    %rbx, %rax
>         call    *e@TLSCALL(%rax)
>         addq    %fs:0, %rax
>         movq    %rax, %rdi
>         call    a@PLT
>         call    b@PLT
>         testl   %eax, %eax
>         jne     .L8
>         movq    %rbx, %rax
>         call    *e@TLSCALL(%rax)
>         popq    %rbx
>         .cfi_remember_state
>         .cfi_def_cfa_offset 8
>         movq    %fs:(%rax), %rax
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L8:
>         .cfi_restore_state
>         call    c@PLT
>         movq    %rbx, %rax
>         call    *e@TLSCALL(%rax)
>         popq    %rbx
>         .cfi_def_cfa_offset 8
>         movq    %fs:(%rax), %rax
>         ret
>         .cfi_endproc
>
> There are 3 "call *e@TLSCALL(%rax)" instructions.  They all return the
> same value.  Rename the remove_redundant_vector_load pass to the x86_cse
> pass and, for 64-bit, extend it to also remove redundant TLS calls, so
> that the following code is generated:
>
> d:
> .LFB0:
>         .cfi_startproc
>         pushq   %rbx
>         .cfi_def_cfa_offset 16
>         .cfi_offset 3, -16
>         leaq    e@TLSDESC(%rip), %rax
>         movq    %fs:0, %rdi
>         call    *e@TLSCALL(%rax)
>         addq    %rax, %rdi
>         movq    %rax, %rbx
>         call    a@PLT
>         call    b@PLT
>         testl   %eax, %eax
>         jne     .L8
>         movq    %fs:(%rbx), %rax
>         popq    %rbx
>         .cfi_remember_state
>         .cfi_def_cfa_offset 8
>         ret
>         .p2align 4,,10
>         .p2align 3
> .L8:
>         .cfi_restore_state
>         call    c@PLT
>         movq    %fs:(%rbx), %rax
>         popq    %rbx
>         .cfi_def_cfa_offset 8
>         ret
>         .cfi_endproc
>
> with only one "call *e@TLSCALL(%rax)".  This reduces the number of
> __tls_get_addr calls in libgcc.a by 72%:
>
> __tls_get_addr calls     before         after
> libgcc.a                 868            243
>
> gcc/
>
>         PR target/81501
>         * config/i386/i386-features.cc (x86_cse_kind): Add X86_CSE_TLS_GD,
>         X86_CSE_TLS_LD_BASE and X86_CSE_TLSDESC.
>         (redundant_load): Renamed to ...
>         (redundant_pattern): This.
>         (replace_tls_call): New.
>         (ix86_place_single_tls_call): Likewise.
>         (pass_remove_redundant_vector_load): Renamed to ...
>         (pass_x86_cse): This.  Add val, def_insn, mode, scalar_mode,
>         kind, candidate_kind, x86_cse, candidate_gnu_tls_p,
>         candidate_gnu2_tls_p and candidate_vector_p.
>         (pass_x86_cse::candidate_gnu_tls_p): New.
>         (pass_x86_cse::candidate_gnu2_tls_p): Likewise.
>         (pass_x86_cse::candidate_vector_p): Likewise.
>         (remove_redundant_vector_load): Renamed to ...
>         (pass_x86_cse::x86_cse): This.  Extend to remove redundant TLS
>         calls.
>         (make_pass_remove_redundant_vector_load): Renamed to ...
>         (make_pass_x86_cse): This.
>         * config/i386/i386-passes.def: Replace
>         pass_remove_redundant_vector_load with pass_x86_cse.
>         * config/i386/i386-protos.h (ix86_tls_get_addr): New.
>         (make_pass_remove_redundant_vector_load): Renamed to ...
>         (make_pass_x86_cse): This.
>         * config/i386/i386.cc (ix86_tls_get_addr): Remove static.
>         * config/i386/i386.h (machine_function): Add
>         tls_descriptor_call_multiple_p.
>         * config/i386/i386.md (tls64): New attribute.
>         (@tls_global_dynamic_64_<mode>): Set tls_descriptor_call_multiple_p.
>         (@tls_local_dynamic_base_64_<mode>): Likewise.
>         (@tls_dynamic_gnu2_64_<mode>): Likewise.
>         (*tls_global_dynamic_64_<mode>): Set tls64 attribute to gd.
>         (*tls_local_dynamic_base_64_<mode>): Set tls64 attribute to ld_base.
>         (*tls_dynamic_gnu2_lea_64_<mode>): Set tls64 attribute to lea.
>         (*tls_dynamic_gnu2_call_64_<mode>): Set tls64 attribute to call.
>         (*tls_dynamic_gnu2_combine_64_<mode>): Set tls64 attribute to
>         combine.
>
> gcc/testsuite/
>
>         PR target/81501
>         * g++.target/i386/pr81501-1.C: New test.
>         * gcc.target/i386/pr81501-1a.c: Likewise.
>         * gcc.target/i386/pr81501-1b.c: Likewise.
>         * gcc.target/i386/pr81501-2a.c: Likewise.
>         * gcc.target/i386/pr81501-2b.c: Likewise.
>         * gcc.target/i386/pr81501-3.c: Likewise.
>         * gcc.target/i386/pr81501-4a.c: Likewise.
>         * gcc.target/i386/pr81501-4b.c: Likewise.
>         * gcc.target/i386/pr81501-5.c: Likewise.
>         * gcc.target/i386/pr81501-6a.c: Likewise.
>         * gcc.target/i386/pr81501-6b.c: Likewise.
>         * gcc.target/i386/pr81501-7.c: Likewise.
>         * gcc.target/i386/pr81501-8a.c: Likewise.
>         * gcc.target/i386/pr81501-8b.c: Likewise.
>         * gcc.target/i386/pr81501-9a.c: Likewise.
>         * gcc.target/i386/pr81501-9b.c: Likewise.
>
> Signed-off-by: H.J. Lu <hjl.to...@gmail.com>
> ---
>  gcc/config/i386/i386-features.cc           | 766 ++++++++++++++++++---
>  gcc/config/i386/i386-passes.def            |   2 +-
>  gcc/config/i386/i386-protos.h              |   4 +-
>  gcc/config/i386/i386.cc                    |   2 +-
>  gcc/config/i386/i386.h                     |   3 +
>  gcc/config/i386/i386.md                    |  25 +-
>  gcc/testsuite/g++.target/i386/pr81501-1.C  |  16 +
>  gcc/testsuite/gcc.target/i386/pr81501-1a.c |  17 +
>  gcc/testsuite/gcc.target/i386/pr81501-1b.c |   6 +
>  gcc/testsuite/gcc.target/i386/pr81501-2a.c |  17 +
>  gcc/testsuite/gcc.target/i386/pr81501-2b.c |   6 +
>  gcc/testsuite/gcc.target/i386/pr81501-3.c  |   9 +
>  gcc/testsuite/gcc.target/i386/pr81501-4a.c |  51 ++
>  gcc/testsuite/gcc.target/i386/pr81501-4b.c |   6 +
>  gcc/testsuite/gcc.target/i386/pr81501-5.c  |  13 +
>  gcc/testsuite/gcc.target/i386/pr81501-6a.c |  67 ++
>  gcc/testsuite/gcc.target/i386/pr81501-6b.c |  28 +
>  gcc/testsuite/gcc.target/i386/pr81501-7.c  |  20 +
>  gcc/testsuite/gcc.target/i386/pr81501-8a.c |  82 +++
>  gcc/testsuite/gcc.target/i386/pr81501-8b.c |  31 +
>  gcc/testsuite/gcc.target/i386/pr81501-9a.c |  39 ++
>  gcc/testsuite/gcc.target/i386/pr81501-9b.c |  22 +
>  22 files changed, 1119 insertions(+), 113 deletions(-)
>  create mode 100644 gcc/testsuite/g++.target/i386/pr81501-1.C
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-1a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-1b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-2a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-2b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-3.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-4a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-4b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-5.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-6a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-6b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-7.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-8a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-8b.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-9a.c
>  create mode 100644 gcc/testsuite/gcc.target/i386/pr81501-9b.c
>
> diff --git a/gcc/config/i386/i386-features.cc b/gcc/config/i386/i386-features.cc
> index c131577805f..d38b297a89a 100644
> --- a/gcc/config/i386/i386-features.cc
> +++ b/gcc/config/i386/i386-features.cc
> @@ -3493,10 +3493,13 @@ enum x86_cse_kind
>  {
>    X86_CSE_CONST0_VECTOR,
>    X86_CSE_CONSTM1_VECTOR,
> -  X86_CSE_VEC_DUP
> +  X86_CSE_VEC_DUP,
> +  X86_CSE_TLS_GD,
> +  X86_CSE_TLS_LD_BASE,
> +  X86_CSE_TLSDESC
>  };
>
> -struct redundant_load
> +struct redundant_pattern
>  {
>    /* Bitmap of basic blocks with broadcast instructions.  */
>    auto_bitmap bbs;
> @@ -3669,22 +3672,570 @@ ix86_broadcast_inner (rtx op, machine_mode mode,
>    return op;
>  }
>
> -/* At entry of the nearest common dominator for basic blocks with vector
> -   CONST0_RTX and integer CONSTM1_RTX uses, generate a single widest
> -   vector set instruction for all CONST0_RTX and integer CONSTM1_RTX
> -   uses.
> +/* Replace CALL instruction in TLS_CALL_INSNS with SET from SRC.  */
>
> -   NB: We want to generate only a single widest vector set to cover the
> -   whole function.  The LCM algorithm isn't appropriate here since it
> -   may place a vector set inside the loop.  */
> +static void
> +replace_tls_call (rtx src, auto_bitmap &tls_call_insns)
> +{
> +  bitmap_iterator bi;
> +  unsigned int id;
>
> -static unsigned int
> -remove_redundant_vector_load (void)
> +  EXECUTE_IF_SET_IN_BITMAP (tls_call_insns, 0, id, bi)
> +    {
> +      rtx_insn *insn = DF_INSN_UID_GET (id)->insn;
> +
> +      /* If this isn't a CALL, only GNU2 TLS implicit CALL patterns are
> +        allowed.  */
> +      if (!CALL_P (insn))
> +       {
> +         attr_tls64 tls64 = get_attr_tls64 (insn);
> +         if (tls64 != TLS64_CALL && tls64 != TLS64_COMBINE)
> +           gcc_unreachable ();
> +       }
> +
> +      rtx pat = PATTERN (insn);
> +      if (GET_CODE (pat) != PARALLEL)
> +       gcc_unreachable ();
> +
> +      int j;
> +      rtx op, dest = nullptr;
> +      for (j = XVECLEN (pat, 0) - 1; j >= 0; j--)
> +       {
> +         op = XVECEXP (pat, 0, j);
> +         if (GET_CODE (op) == SET)
> +           {
> +             dest = SET_DEST (op);
> +             break;
> +           }
> +       }
> +
> +      rtx set = gen_rtx_SET (dest, src);
> +      rtx_insn *set_insn = emit_insn_after (set, insn);
> +      if (recog_memoized (set_insn) < 0)
> +       gcc_unreachable ();
> +
> +      if (dump_file)
> +       {
> +         fprintf (dump_file, "\nReplace:\n\n");
> +         print_rtl_single (dump_file, insn);
> +         fprintf (dump_file, "\nwith:\n\n");
> +         print_rtl_single (dump_file, set_insn);
> +         fprintf (dump_file, "\n");
> +       }
> +
> +      /* Delete the CALL insn.  */
> +      delete_insn (insn);
> +
> +      df_insn_rescan (set_insn);
> +    }
> +}
> +
> +/* Generate a TLS call of KIND with VAL and copy the call result to DEST,
> +   at entry of the nearest dominator for basic block map BBS, which is in
> +   the fake loop that contains the whole function, so that there is only
> +   a single TLS CALL of KIND with VAL in the whole function.  If
> +   TLSDESC_SET isn't nullptr, insert it before the TLS call.  */
> +
> +static void
> +ix86_place_single_tls_call (rtx dest, rtx val, x86_cse_kind kind,
> +                           bitmap bbs, rtx tlsdesc_set = nullptr)
> +{
> +  basic_block bb = nearest_common_dominator_for_set (CDI_DOMINATORS, bbs);
> +  while (bb->loop_father->latch
> +        != EXIT_BLOCK_PTR_FOR_FN (cfun))
> +    bb = get_immediate_dominator (CDI_DOMINATORS,
> +                                 bb->loop_father->header);
> +
> +  rtx_insn *insn = BB_HEAD (bb);
> +  while (insn && !NONDEBUG_INSN_P (insn))
> +    {
> +      if (insn == BB_END (bb))
> +       {
> +         insn = NULL;
> +         break;
> +       }
> +      insn = NEXT_INSN (insn);
> +    }
> +
> +  rtx rax = nullptr, rdi;
> +  rtx eqv = nullptr;
> +  rtx caddr;
> +  rtx set;
> +  rtx clob;
> +  rtx symbol;
> +  rtx tls;
> +  rtx_insn *tls_insn;
> +
> +  switch (kind)
> +    {
> +    case X86_CSE_TLS_GD:
> +      rax = gen_rtx_REG (Pmode, AX_REG);
> +      rdi = gen_rtx_REG (Pmode, DI_REG);
> +      caddr = ix86_tls_get_addr ();
> +
> +      symbol = XVECEXP (val, 0, 0);
> +      tls = gen_tls_global_dynamic_64 (Pmode, rax, symbol, caddr, rdi);
> +
> +      if (GET_MODE (symbol) != Pmode)
> +       symbol = gen_rtx_ZERO_EXTEND (Pmode, symbol);
> +      eqv = symbol;
> +      break;
> +
> +    case X86_CSE_TLS_LD_BASE:
> +      rax = gen_rtx_REG (Pmode, AX_REG);
> +      rdi = gen_rtx_REG (Pmode, DI_REG);
> +      caddr = ix86_tls_get_addr ();
> +
> +      tls = gen_tls_local_dynamic_base_64 (Pmode, rax, caddr, rdi);
> +
> +      /* Attach a unique REG_EQUAL to DEST, to allow the RTL optimizers
> +        to share the LD_BASE result with other LD model accesses.  */
> +      eqv = gen_rtx_UNSPEC (Pmode, gen_rtvec (1, const0_rtx),
> +                           UNSPEC_TLS_LD_BASE);
> +
> +      break;
> +
> +    case X86_CSE_TLSDESC:
> +      set = gen_rtx_SET (dest, val);
> +      clob = gen_rtx_CLOBBER (VOIDmode,
> +                             gen_rtx_REG (CCmode, FLAGS_REG));
> +      tls = gen_rtx_PARALLEL (VOIDmode, gen_rtvec (2, set, clob));
> +      break;
> +
> +    default:
> +      gcc_unreachable ();
> +    }
> +
> +  rtx_insn *before = nullptr;
> +  rtx_insn *after = nullptr;
> +  if (insn == BB_HEAD (bb))
> +    before = insn;
> +  else
> +    after = insn ? PREV_INSN (insn) : BB_END (bb);
> +
> +  /* TLS_GD and TLS_LD_BASE instructions are normal functions which
> +     clobber caller-saved registers.  TLSDESC instructions are special
> +     functions which only clobber RAX.  If any registers clobbered by
> +     the TLS instruction are live in this basic block, we must insert
> +     the TLS instruction after all live registers clobbered by the TLS
> +     instruction are dead.  */
> +
> +  auto_bitmap live_caller_saved_regs;
> +  bitmap in = df_live ? DF_LIVE_IN (bb) : DF_LR_IN (bb);
> +
> +  bool flags_live_p = bitmap_bit_p (in, FLAGS_REG);
> +
> +  unsigned int i;
> +
> +  /* Get all live caller-saved registers.  */
> +  if (kind == X86_CSE_TLSDESC)
> +    {
> +      if (bitmap_bit_p (in, AX_REG))
> +       bitmap_set_bit (live_caller_saved_regs, AX_REG);
> +    }
> +  else
> +    for (i = 0; i < FIRST_PSEUDO_REGISTER; i++)
> +      if (call_used_regs[i]
> +         && !fixed_regs[i]
> +         && bitmap_bit_p (in, i))
> +       bitmap_set_bit (live_caller_saved_regs, i);
> +
> +  if (!bitmap_empty_p (live_caller_saved_regs))
> +    {
> +      /* Search for REG_DEAD notes in this basic block.  */
> +      FOR_BB_INSNS (bb, insn)
> +       {
> +         if (!NONDEBUG_INSN_P (insn))
> +           continue;
> +
> +         /* Check if FLAGS register is live.  */
> +         set = single_set (insn);
> +         if (set)
> +           {
> +             rtx dest = SET_DEST (set);
> +             if (REG_P (dest) && REGNO (dest) == FLAGS_REG)
> +               flags_live_p = true;
> +           }
> +
> +         rtx link;
> +         for (link = REG_NOTES (insn); link; link = XEXP (link, 1))
> +           if (REG_NOTE_KIND (link) == REG_DEAD
> +               && REG_P (XEXP (link, 0)))
> +             {
> +               /* Mark the live caller-saved register as dead.  */
> +               for (i = REGNO (XEXP (link, 0));
> +                    i < END_REGNO (XEXP (link, 0));
> +                    i++)
> +                 bitmap_clear_bit (live_caller_saved_regs, i);
> +
> +               /* Check if FLAGS register is dead.  */
> +               if (REGNO (XEXP (link, 0)) == FLAGS_REG)
> +                 flags_live_p = false;
> +
> +               if (bitmap_empty_p (live_caller_saved_regs))
> +                 {
> +                   /* All live caller-saved registers are dead after
> +                      this instruction.  Since TLS instructions
> +                      clobber FLAGS register, it must be dead where
> +                      the TLS will be inserted after.  */
> +                   if (flags_live_p)
> +                     gcc_unreachable ();
> +                   after = insn;
> +                   goto insert_after;
> +                 }
> +             }
> +       }
> +
> +      /* All live caller-saved registers should be dead at the end
> +        of this basic block.  */
> +      gcc_unreachable ();
> +    }
> +
> +  /* Emit the TLS CALL insn.  */
> +  if (after)
> +    {
> +insert_after:
> +      tls_insn = emit_insn_after (tls, after);
> +    }
> +  else
> +    tls_insn = emit_insn_before (tls, before);
> +
> +  rtx_insn *tlsdesc_insn = nullptr;
> +  if (tlsdesc_set)
> +    {
> +      rtx dest = copy_rtx (SET_DEST (tlsdesc_set));
> +      rtx src = copy_rtx (SET_SRC (tlsdesc_set));
> +      tlsdesc_set = gen_rtx_SET (dest, src);
> +      tlsdesc_insn = emit_insn_before (tlsdesc_set, tls_insn);
> +    }
> +
> +  if (kind != X86_CSE_TLSDESC)
> +    {
> +      RTL_CONST_CALL_P (tls_insn) = 1;
> +
> +      /* Indicate that this function can't jump to non-local gotos.  */
> +      make_reg_eh_region_note_nothrow_nononlocal (tls_insn);
> +    }
> +
> +  if (recog_memoized (tls_insn) < 0)
> +    gcc_unreachable ();
> +
> +  if (dump_file)
> +    {
> +      if (after)
> +       {
> +         fprintf (dump_file, "\nPlace:\n\n");
> +         if (tlsdesc_insn)
> +           print_rtl_single (dump_file, tlsdesc_insn);
> +         print_rtl_single (dump_file, tls_insn);
> +         fprintf (dump_file, "\nafter:\n\n");
> +         print_rtl_single (dump_file, after);
> +         fprintf (dump_file, "\n");
> +       }
> +      else
> +       {
> +         fprintf (dump_file, "\nPlace:\n\n");
> +         if (tlsdesc_insn)
> +           print_rtl_single (dump_file, tlsdesc_insn);
> +         print_rtl_single (dump_file, tls_insn);
> +         fprintf (dump_file, "\nbefore:\n\n");
> +         print_rtl_single (dump_file, insn);
> +         fprintf (dump_file, "\n");
> +       }
> +    }
> +
> +  if (kind != X86_CSE_TLSDESC)
> +    {
> +      /* Copy RAX to DEST.  */
> +      set = gen_rtx_SET (dest, rax);
> +      rtx_insn *set_insn = emit_insn_after (set, tls_insn);
> +      set_dst_reg_note (set_insn, REG_EQUAL, copy_rtx (eqv), dest);
> +      if (dump_file)
> +       {
> +         fprintf (dump_file, "\nPlace:\n\n");
> +         print_rtl_single (dump_file, set_insn);
> +         fprintf (dump_file, "\nafter:\n\n");
> +         print_rtl_single (dump_file, tls_insn);
> +         fprintf (dump_file, "\n");
> +       }
> +    }
> +}
> +
> +namespace {
> +
> +const pass_data pass_data_x86_cse =
> +{
> +  RTL_PASS, /* type */
> +  "x86_cse", /* name */
> +  OPTGROUP_NONE, /* optinfo_flags */
> +  TV_MACH_DEP, /* tv_id */
> +  0, /* properties_required */
> +  0, /* properties_provided */
> +  0, /* properties_destroyed */
> +  0, /* todo_flags_start */
> +  0, /* todo_flags_finish */
> +};
> +
> +class pass_x86_cse : public rtl_opt_pass
> +{
> +public:
> +  pass_x86_cse (gcc::context *ctxt)
> +    : rtl_opt_pass (pass_data_x86_cse, ctxt)
> +  {}
> +
> +  /* opt_pass methods: */
> +  bool gate (function *fun) final override
> +    {
> +      return (TARGET_SSE2
> +             && optimize
> +             && optimize_function_for_speed_p (fun));
> +    }
> +
> +  unsigned int execute (function *) final override
> +    {
> +      return x86_cse ();
> +    }
> +
> +private:
> +  /* The redundant source value.  */
> +  rtx val;
> +  /* The instruction which defines the redundant value.  */
> +  rtx_insn *def_insn;
> +  /* Mode of the destination of the candidate redundant instruction.  */
> +  machine_mode mode;
> +  /* Mode of the source of the candidate redundant instruction.  */
> +  machine_mode scalar_mode;
> +  /* The classification of the candidate redundant instruction.  */
> +  x86_cse_kind kind;
> +
> +  enum candidate_kind
> +    {
> +      candidate_no,    /* Instruction isn't a candidate.  */
> +      candidate_ignore,        /* Instruction should be ignored.  */
> +      candidate_yes    /* Instruction is a candidate.  */
> +    };
> +
> +  unsigned int x86_cse (void);
> +  candidate_kind candidate_gnu_tls_p (rtx_insn *);
> +  candidate_kind candidate_gnu2_tls_p (rtx_insn *, rtx);
> +  bool candidate_vector_p (rtx, rtx);
> +}; // class pass_x86_cse
> +
> +/* Return candidate_yes and output def_insn, val, mode, scalar_mode and
> +   kind if INSN is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE.  */
> +
> +pass_x86_cse::candidate_kind
> +pass_x86_cse::candidate_gnu_tls_p (rtx_insn *insn)
> +{
> +  if (!TARGET_64BIT
> +      || !cfun->machine->tls_descriptor_call_multiple_p
> +      || !CALL_P (insn))
> +    return candidate_no;
> +
> +  /* Record the redundant TLS CALLs for 64-bit:
> +
> +     (parallel [
> +       (set (reg:DI 0 ax)
> +            (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
> +                     (const_int 0 [0])))
> +       (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
> +                   (reg/f:DI 7 sp)] UNSPEC_TLS_GD)
> +       (clobber (reg:DI 5 di))])
> +
> +
> +     and
> +
> +     (parallel [
> +       (set (reg:DI 0 ax)
> +            (call:DI (mem:QI (symbol_ref:DI ("__tls_get_addr")))
> +                     (const_int 0 [0])))
> +       (unspec:DI [(reg/f:DI 7 sp)] UNSPEC_TLS_LD_BASE)])
> +
> +   */
> +
> +  rtx pat, set, dest;
> +  attr_tls64 tls64 = get_attr_tls64 (insn);
> +  switch (tls64)
> +    {
> +    default:
> +      return candidate_ignore;
> +
> +    case TLS64_GD:
> +    case TLS64_LD_BASE:
> +      pat = PATTERN (insn);
> +      set = XVECEXP (pat, 0, 0);
> +      gcc_assert (GET_CODE (set) == SET);
> +      dest = SET_DEST (set);
> +      scalar_mode = mode = GET_MODE (dest);
> +      val = XVECEXP (pat, 0, 1);
> +      gcc_assert (GET_CODE (val) == UNSPEC);
> +      break;
> +    }
> +
> +  if (tls64 == TLS64_GD)
> +    kind = X86_CSE_TLS_GD;
> +  else
> +    kind = X86_CSE_TLS_LD_BASE;
> +
> +  def_insn = nullptr;
> +  return candidate_yes;
> +}
> +
> +/* Return candidate_yes and output def_insn, val, mode, scalar_mode and
> +   kind if INSN is UNSPEC_TLSDESC.  */
> +
> +pass_x86_cse::candidate_kind
> +pass_x86_cse::candidate_gnu2_tls_p (rtx_insn *insn, rtx src)
> +{
> +  if (!TARGET_64BIT || !cfun->machine->tls_descriptor_call_multiple_p)
> +    return candidate_no;
> +
> +  /* Record GNU2 TLS CALLs for 64-bit:
> +
> +     (parallel [
> +       (set (reg/f:DI 104)
> +            (plus:DI (unspec:DI [
> +                        (symbol_ref:DI ("_TLS_MODULE_BASE_") [flags 0x10])
> +                        (reg:DI 114)
> +                        (reg/f:DI 7 sp)] UNSPEC_TLSDESC)
> +                     (const:DI (unspec:DI [
> +                                  (symbol_ref:DI ("e") [flags 0x1a])
> +                                ] UNSPEC_DTPOFF))))
> +       (clobber (reg:CC 17 flags))])
> +
> +     and
> +
> +     (parallel [
> +       (set (reg:DI 101)
> +            (unspec:DI [(symbol_ref:DI ("foo") [flags 0x50])
> +                        (reg:DI 112)
> +                        (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
> +       (clobber (reg:CC 17 flags))])
> +
> +   */
> +
> +  attr_tls64 tls64 = get_attr_tls64 (insn);
> +  if (tls64 == TLS64_CALL)
> +    val = src;
> +  else if (tls64 == TLS64_COMBINE)
> +    {
> +      val = src;
> +      src = XEXP (src, 0);
> +    }
> +  else
> +    return candidate_no;
> +
> +  kind = X86_CSE_TLSDESC;
> +  gcc_assert (GET_CODE (src) == UNSPEC);
> +  src = XVECEXP (src, 0, 1);
> +  scalar_mode = mode = GET_MODE (src);
> +  if (REG_P (src))
> +    {
> +      /* All definitions of reg:DI 129 in
> +
> +        (set (reg:DI 110)
> +             (unspec:DI [(symbol_ref:DI ("foo"))
> +                         (reg:DI 129)
> +                         (reg/f:DI 7 sp)] UNSPEC_TLSDESC))
> +
> +        should have the same source as in
> +
> +        (set (reg:DI 129)
> +             (unspec:DI [(symbol_ref:DI ("foo"))] UNSPEC_TLSDESC))
> +
> +       */
> +
> +      df_ref ref;
> +      rtx_insn *set_insn = nullptr;
> +      rtx tls_src = nullptr;
> +      for (ref = DF_REG_DEF_CHAIN (REGNO (src));
> +          ref;
> +          ref = DF_REF_NEXT_REG (ref))
> +       {
> +         if (DF_REF_IS_ARTIFICIAL (ref))
> +           break;
> +
> +         set_insn = DF_REF_INSN (ref);
> +         tls64 = get_attr_tls64 (set_insn);
> +         if (tls64 != TLS64_LEA)
> +           {
> +             set_insn = nullptr;
> +             break;
> +           }
> +
> +         rtx tls_set = PATTERN (set_insn);
> +         if (!tls_src)
> +           tls_src = SET_SRC (tls_set);
> +         else if (!rtx_equal_p (tls_src, SET_SRC (tls_set)))
> +           {
> +             set_insn = nullptr;
> +             break;
> +           }
> +       }
> +
> +      if (!set_insn)
> +       return candidate_ignore;
> +
> +      rtx set = single_set (insn);
> +      if (!set)
> +       return candidate_ignore;
> +
> +      def_insn = set_insn;
> +    }
> +  else if (GET_CODE (src) == UNSPEC
> +          && XINT (src, 1) == UNSPEC_TLSDESC
> +          && SYMBOL_REF_P (XVECEXP (src, 0, 0)))
> +    def_insn = nullptr;
> +  else
> +    gcc_unreachable ();
> +
> +  return candidate_yes;
> +}
> +
> +/* Return true and output def_insn, val, mode, scalar_mode and kind if
> +  INSN is a vector broadcast instruction.  */
> +
> +bool
> +pass_x86_cse::candidate_vector_p (rtx set, rtx src)
> +{
> +  rtx dest = SET_DEST (set);
> +  mode = GET_MODE (dest);
> +  /* Skip non-vector instruction.  */
> +  if (!VECTOR_MODE_P (mode))
> +    return false;
> +
> +  /* Skip non-vector load instruction.  */
> +  if (!REG_P (dest) && !SUBREG_P (dest))
> +    return false;
> +
> +  val = ix86_broadcast_inner (src, mode, &scalar_mode, &kind,
> +                             &def_insn);
> +  return val ? true : false;
> +}
> +
> +/* At entry of the nearest common dominator for basic blocks with
> +
> +   1. Vector CONST0_RTX patterns.
> +   2. Vector CONSTM1_RTX patterns.
> +   3. Vector broadcast patterns.
> +   4. UNSPEC_TLS_GD patterns.
> +   5. UNSPEC_TLS_LD_BASE patterns.
> +   6. UNSPEC_TLSDESC patterns.
> +
> +   generate a single pattern whose destination is used to replace the
> +   source in all identical patterns.
> +
> +   NB: We want to generate a pattern, which is executed only once, to
> +   cover the whole function.  The LCM algorithm isn't appropriate here
> +   since it may place a pattern inside the loop.  */
> +
> +unsigned int
> +pass_x86_cse::x86_cse (void)
>  {
>    timevar_push (TV_MACH_DEP);
>
> -  auto_vec<redundant_load *> loads;
> -  redundant_load *load;
> +  auto_vec<redundant_pattern *> loads;
> +  redundant_pattern *load;
>    basic_block bb;
>    rtx_insn *insn;
>    unsigned int i;
> @@ -3700,61 +4251,74 @@ remove_redundant_vector_load (void)
>           if (!NONDEBUG_INSN_P (insn))
>             continue;
>
> -         rtx set = single_set (insn);
> -         if (!set)
> -           continue;
> +         bool matched = false;
> +         rtx set, src;
> +         /* Remove redundant patterns if there are more than 2 of
> +            them.  */
> +         unsigned int threshold = 2;
>
> -         /* Record single set vector instruction with CONST0_RTX and
> -            CONSTM1_RTX source.  Record basic blocks with CONST0_RTX and
> -            CONSTM1_RTX.  Count CONST0_RTX and CONSTM1_RTX.  Record the
> -            maximum size of CONST0_RTX and CONSTM1_RTX.  */
> +         /* First check UNSPEC_TLS_GD and UNSPEC_TLS_LD_BASE.  */
> +         switch (candidate_gnu_tls_p (insn))

Can we just dispatch on the tls64 attribute here, with each candidate_*_p
helper returning true if the insn is a candidate and false otherwise?
Something like:

  switch (get_attr_tls64 (insn))
    {
    case TLS64_GD:
    case TLS64_LD_BASE:
      if (!candidate_gnu_tls_p (insn))
        continue;
      break;

    case TLS64_CALL:
    case TLS64_COMBINE:
      if (!candidate_gnu2_tls_p (insn))
        continue;
      break;

    case TLS64_NONE:
      if (!candidate_vector_p (insn))
        continue;
      break;

    default:
      continue;
    }
> +           {
> +           case candidate_no:
> +             /* This isn't UNSPEC_TLS_GD nor UNSPEC_TLS_LD_BASE.  */
> +             set = single_set (insn);
> +             if (!set)
> +               continue;
>
> -         rtx dest = SET_DEST (set);
> -         machine_mode mode = GET_MODE (dest);
> -         /* Skip non-vector instruction.  */
> -         if (!VECTOR_MODE_P (mode))
> -           continue;
> +             src = SET_SRC (set);
>
> -         rtx src = SET_SRC (set);
> -         /* Skip non-vector load instruction.  */
> -         if (!REG_P (dest) && !SUBREG_P (dest))
> -           continue;
> +             /* Check UNSPEC_TLSDESC.  */
> +             switch (candidate_gnu2_tls_p (insn, src))
> +               {
> +               case candidate_no:
> +                 /* Check vector instruction.  */
> +                 if (candidate_vector_p (set, src))
> +                   break;
> +                 continue;
> +               case candidate_ignore:
> +                 /* Not a candidate.  Skip.  */
> +                 continue;
> +               case candidate_yes:
> +                 break;
> +               }
> +             break;
>
> -         rtx_insn *def_insn;
> -         machine_mode scalar_mode;
> -         x86_cse_kind kind;
> -         rtx val = ix86_broadcast_inner (src, mode, &scalar_mode,
> -                                         &kind, &def_insn);
> -         if (!val)
> -           continue;
> +           case candidate_ignore:
> +             /* Not a candidate.  Skip.  */
> +             continue;
>
> -          /* Remove redundant register loads if there are more than 2
> -             loads will be used.  */
> -         unsigned int threshold = 2;
> +           case candidate_yes:
> +             /* This is UNSPEC_TLS_GD or UNSPEC_TLS_LD_BASE.  */
> +             break;
> +           }
>
> -         /* Check if there is a matching redundant vector load.   */
> -         bool matched = false;
> +         /* Check if there is a matching redundant load.   */
>           FOR_EACH_VEC_ELT (loads, i, load)
>             if (load->val
>                 && load->kind == kind
>                 && load->mode == scalar_mode
>                 && (load->bb == bb
> -                   || kind < X86_CSE_VEC_DUP
> +                   || kind != X86_CSE_VEC_DUP
>                     /* Non all 0s/1s vector load must be in the same
>                        basic block if it is in a recursive call.  */
>                     || !recursive_call_p)
>                 && rtx_equal_p (load->val, val))
>               {
> -               /* Record vector instruction.  */
> +               /* Record instruction.  */
>                 bitmap_set_bit (load->insns, INSN_UID (insn));
>
>                 /* Record the maximum vector size.  */
> -               if (load->size < GET_MODE_SIZE (mode))
> +               if (kind <= X86_CSE_VEC_DUP
> +                   && load->size < GET_MODE_SIZE (mode))
>                   load->size = GET_MODE_SIZE (mode);
>
>                 /* Record the basic block.  */
>                 bitmap_set_bit (load->bbs, bb->index);
> +
> +               /* Increment the count.  */
>                 load->count++;
> +
>                 matched = true;
>                 break;
>               }
> @@ -3762,8 +4326,11 @@ remove_redundant_vector_load (void)
>           if (matched)
>             continue;
>
> -         /* We see this vector broadcast the first time.  */
> -         load = new redundant_load;
> +         /* We see this instruction the first time.  Record the
> +            redundant source value, its mode, the destination size,
> +            instruction which defines the redundant source value,
> +            instruction basic block and the instruction kind.  */
> +         load = new redundant_pattern;
>
>           load->val = copy_rtx (val);
>           load->mode = scalar_mode;
> @@ -3786,6 +4353,15 @@ remove_redundant_vector_load (void)
>    FOR_EACH_VEC_ELT (loads, i, load)
>      if (load->count >= load->threshold)
>        {
And we can also have
switch (load->kind)
  {
      case X86_CSE_TLS_GD:
      case X86_CSE_TLS_LD_BASE:
      case X86_CSE_TLSDESC:
         ix86_cse_replace_tls_call (....);
         break;

      case X86_CSE_CONST0_VECTOR:
      case X86_CSE_CONSTM1_VECTOR:
         ix86_cse_replace_const0_m1 (...);
         break;

      case X86_CSE_VEC_DUP:
        ix86_cse_replace_vec_dup (...);
        break;

      default:
          gcc_unreachable();
  }

I think that would be more readable and easier to maintain.

> +       if (load->kind > X86_CSE_VEC_DUP)
> +         {
> +           broadcast_reg = gen_reg_rtx (load->mode);
> +           replace_tls_call (broadcast_reg, load->insns);
> +           load->broadcast_reg = broadcast_reg;
> +           replaced = true;
> +           continue;
> +         }
> +
>         machine_mode mode = ix86_get_vector_cse_mode (load->size,
>                                                       load->mode);
>         broadcast_reg = gen_reg_rtx (mode);
> @@ -3841,34 +4417,48 @@ remove_redundant_vector_load (void)
>           {
>             if (load->def_insn)
>               {
> -               /* Insert a broadcast after the original scalar
> -                  definition.  */
> -               rtx set = gen_rtx_SET (load->broadcast_reg,
> -                                      load->broadcast_source);
> -               insn = emit_insn_after (set, load->def_insn);
> -
> -               if (cfun->can_throw_non_call_exceptions)
> +               if (load->kind == X86_CSE_TLSDESC)
> +                 ix86_place_single_tls_call (load->broadcast_reg,
> +                                             load->val,
> +                                             load->kind,
> +                                             load->bbs,
> +                                             PATTERN (load->def_insn));
> +               else
>                   {
> -                   /* Handle REG_EH_REGION note in DEF_INSN.  */
> -                   rtx note = find_reg_note (load->def_insn,
> -                                             REG_EH_REGION, nullptr);
> -                   if (note)
> +                   /* Insert a broadcast after the original scalar
> +                      definition.  */
> +                   rtx set = gen_rtx_SET (load->broadcast_reg,
> +                                          load->broadcast_source);
> +                   insn = emit_insn_after (set, load->def_insn);
> +
> +                   if (cfun->can_throw_non_call_exceptions)
>                       {
> -                       control_flow_insns.safe_push (load->def_insn);
> -                       add_reg_note (insn, REG_EH_REGION,
> -                                     XEXP (note, 0));
> +                       /* Handle REG_EH_REGION note in DEF_INSN.  */
> +                       rtx note = find_reg_note (load->def_insn,
> +                                                 REG_EH_REGION, nullptr);
> +                       if (note)
> +                         {
> +                           control_flow_insns.safe_push (load->def_insn);
> +                           add_reg_note (insn, REG_EH_REGION,
> +                                         XEXP (note, 0));
> +                         }
>                       }
> -                 }
>
> -               if (dump_file)
> -                 {
> -                   fprintf (dump_file, "\nAdd:\n\n");
> -                   print_rtl_single (dump_file, insn);
> -                   fprintf (dump_file, "\nafter:\n\n");
> -                   print_rtl_single (dump_file, load->def_insn);
> -                   fprintf (dump_file, "\n");
> +                   if (dump_file)
> +                     {
> +                       fprintf (dump_file, "\nAdd:\n\n");
> +                       print_rtl_single (dump_file, insn);
> +                       fprintf (dump_file, "\nafter:\n\n");
> +                       print_rtl_single (dump_file, load->def_insn);
> +                       fprintf (dump_file, "\n");
> +                     }
>                   }
>               }
> +           else if (load->kind > X86_CSE_VEC_DUP)
> +             ix86_place_single_tls_call (load->broadcast_reg,
> +                                         load->val,
> +                                         load->kind,
> +                                         load->bbs);
>             else
>               ix86_place_single_vector_set (load->broadcast_reg,
>                                             load->broadcast_source,
> @@ -3905,48 +4495,12 @@ remove_redundant_vector_load (void)
>    return 0;
>  }
>
> -namespace {
> -
> -const pass_data pass_data_remove_redundant_vector_load =
> -{
> -  RTL_PASS, /* type */
> -  "rrvl", /* name */
> -  OPTGROUP_NONE, /* optinfo_flags */
> -  TV_MACH_DEP, /* tv_id */
> -  0, /* properties_required */
> -  0, /* properties_provided */
> -  0, /* properties_destroyed */
> -  0, /* todo_flags_start */
> -  0, /* todo_flags_finish */
> -};
> -
> -class pass_remove_redundant_vector_load : public rtl_opt_pass
> -{
> -public:
> -  pass_remove_redundant_vector_load (gcc::context *ctxt)
> -    : rtl_opt_pass (pass_data_remove_redundant_vector_load, ctxt)
> -  {}
> -
> -  /* opt_pass methods: */
> -  bool gate (function *fun) final override
> -    {
> -      return (TARGET_SSE2
> -             && optimize
> -             && optimize_function_for_speed_p (fun));
> -    }
> -
> -  unsigned int execute (function *) final override
> -    {
> -      return remove_redundant_vector_load ();
> -    }
> -}; // class pass_remove_redundant_vector_load
> -
>  } // anon namespace
>
>  rtl_opt_pass *
> -make_pass_remove_redundant_vector_load (gcc::context *ctxt)
> +make_pass_x86_cse (gcc::context *ctxt)
>  {
> -  return new pass_remove_redundant_vector_load (ctxt);
> +  return new pass_x86_cse (ctxt);
>  }
>
>  /* Convert legacy instructions that clobbers EFLAGS to APX_NF
> diff --git a/gcc/config/i386/i386-passes.def b/gcc/config/i386/i386-passes.def
> index 06f0288b067..553b46d1fdc 100644
> --- a/gcc/config/i386/i386-passes.def
> +++ b/gcc/config/i386/i386-passes.def
> @@ -35,6 +35,6 @@ along with GCC; see the file COPYING3.  If not see
>       PR116174.  */
>    INSERT_PASS_BEFORE (pass_shorten_branches, 1, pass_align_tight_loops);
>
> -  INSERT_PASS_AFTER (pass_late_combine, 1, 
> pass_remove_redundant_vector_load);
> +  INSERT_PASS_AFTER (pass_late_combine, 1, pass_x86_cse);
>    INSERT_PASS_AFTER (pass_late_combine, 1, 
> pass_remove_partial_avx_dependency);
>    INSERT_PASS_AFTER (pass_rtl_ifcvt, 1, pass_apx_nf_convert);
> diff --git a/gcc/config/i386/i386-protos.h b/gcc/config/i386/i386-protos.h
> index 69bc0ee570d..ee6b78b2c77 100644
> --- a/gcc/config/i386/i386-protos.h
> +++ b/gcc/config/i386/i386-protos.h
> @@ -290,6 +290,7 @@ extern rtx ix86_tls_module_base (void);
>  extern bool ix86_gpr_tls_address_pattern_p (rtx);
>  extern bool ix86_tls_address_pattern_p (rtx);
>  extern rtx ix86_rewrite_tls_address (rtx);
> +extern rtx ix86_tls_get_addr (void);
>
>  extern void ix86_expand_vector_init (bool, rtx, rtx);
>  extern void ix86_expand_vector_set (bool, rtx, rtx, int);
> @@ -430,8 +431,7 @@ extern rtl_opt_pass 
> *make_pass_insert_endbr_and_patchable_area
>    (gcc::context *);
>  extern rtl_opt_pass *make_pass_remove_partial_avx_dependency
>    (gcc::context *);
> -extern rtl_opt_pass *make_pass_remove_redundant_vector_load
> -  (gcc::context *);
> +extern rtl_opt_pass *make_pass_x86_cse (gcc::context *);
>  extern rtl_opt_pass *make_pass_apx_nf_convert (gcc::context *);
>  extern rtl_opt_pass *make_pass_align_tight_loops (gcc::context *);
>
> diff --git a/gcc/config/i386/i386.cc b/gcc/config/i386/i386.cc
> index 4682db85ce4..8e66362862a 100644
> --- a/gcc/config/i386/i386.cc
> +++ b/gcc/config/i386/i386.cc
> @@ -12439,7 +12439,7 @@ ix86_tls_index (void)
>
>  static GTY(()) rtx ix86_tls_symbol;
>
> -static rtx
> +rtx
>  ix86_tls_get_addr (void)
>  {
>    if (!ix86_tls_symbol)
> diff --git a/gcc/config/i386/i386.h b/gcc/config/i386/i386.h
> index 791f3b9e133..912b942aa1e 100644
> --- a/gcc/config/i386/i386.h
> +++ b/gcc/config/i386/i386.h
> @@ -2865,6 +2865,9 @@ struct GTY(()) machine_function {
>       approximation.  */
>    BOOL_BITFIELD tls_descriptor_call_expanded_p : 1;
>
> +  /* True if TLS descriptor is called more than once.  */
> +  BOOL_BITFIELD tls_descriptor_call_multiple_p : 1;
> +
>    /* If true, the current function has a STATIC_CHAIN is placed on the
>       stack below the return address.  */
>    BOOL_BITFIELD static_chain_on_stack : 1;
> diff --git a/gcc/config/i386/i386.md b/gcc/config/i386/i386.md
> index eb526997584..6f15d850c82 100644
> --- a/gcc/config/i386/i386.md
> +++ b/gcc/config/i386/i386.md
> @@ -901,6 +901,10 @@ (define_attr "i387_cw" 
> "roundeven,floor,ceil,trunc,uninitialized,any"
>  (define_attr "avx_partial_xmm_update" "false,true"
>    (const_string "false"))
>
> +;; Define attribute to indicate 64-bit TLS insns.
> +(define_attr "tls64" "gd,ld_base,call,combine,lea,none"
> +  (const_string "none"))
> +
>  ;; Define attribute to classify add/sub insns that consumes carry flag (CF)
>  (define_attr "use_carry" "0,1" (const_string "0"))
>
> @@ -23243,6 +23247,7 @@ (define_insn "*tls_global_dynamic_64_<mode>"
>    return "call\t{*%p2@GOTPCREL(%%rip)|[QWORD PTR %p2@GOTPCREL[rip]]}";
>  }
>    [(set_attr "type" "multi")
> +   (set_attr "tls64" "gd")
>     (set (attr "length")
>         (symbol_ref "TARGET_X32 ? 15 : 16"))])
>
> @@ -23281,7 +23286,11 @@ (define_expand "@tls_global_dynamic_64_<mode>"
>                UNSPEC_TLS_GD)
>       (clobber (match_operand:P 3 "register_operand"))])]
>    "TARGET_64BIT"
> -  "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
> +{
> +  if (ix86_tls_descriptor_calls_expanded_in_cfun)
> +    cfun->machine->tls_descriptor_call_multiple_p = true;
> +  ix86_tls_descriptor_calls_expanded_in_cfun = true;
> +})
>
>  (define_insn "*tls_local_dynamic_base_32_gnu"
>    [(set (match_operand:SI 0 "register_operand" "=a")
> @@ -23343,6 +23352,7 @@ (define_insn "*tls_local_dynamic_base_64_<mode>"
>    return "call\t{*%p1@GOTPCREL(%%rip)|[QWORD PTR %p1@GOTPCREL[rip]]}";
>  }
>    [(set_attr "type" "multi")
> +   (set_attr "tls64" "ld_base")
>     (set_attr "length" "12")])
>
>  (define_insn "*tls_local_dynamic_base_64_largepic"
> @@ -23376,7 +23386,11 @@ (define_expand "@tls_local_dynamic_base_64_<mode>"
>        (unspec:P [(reg:P SP_REG)] UNSPEC_TLS_LD_BASE)
>        (clobber (match_operand:P 2 "register_operand"))])]
>    "TARGET_64BIT"
> -  "ix86_tls_descriptor_calls_expanded_in_cfun = true;")
> +{
> +  if (ix86_tls_descriptor_calls_expanded_in_cfun)
> +    cfun->machine->tls_descriptor_call_multiple_p = true;
> +  ix86_tls_descriptor_calls_expanded_in_cfun = true;
> +})
>
>  ;; Local dynamic of a single variable is a lose.  Show combine how
>  ;; to convert that back to global dynamic.
> @@ -23570,6 +23584,8 @@ (define_expand "@tls_dynamic_gnu2_64_<mode>"
>    "TARGET_64BIT && TARGET_GNU2_TLS"
>  {
>    operands[2] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : 
> operands[0];
> +  if (ix86_tls_descriptor_calls_expanded_in_cfun)
> +    cfun->machine->tls_descriptor_call_multiple_p = true;
>    ix86_tls_descriptor_calls_expanded_in_cfun = true;
>  })
>
> @@ -23581,6 +23597,7 @@ (define_insn "*tls_dynamic_gnu2_lea_64_<mode>"
>    "lea%z0\t{%E1@TLSDESC(%%rip), %0|%0, %E1@TLSDESC[rip]}"
>    [(set_attr "type" "lea")
>     (set_attr "mode" "<MODE>")
> +   (set_attr "tls64" "lea")
>     (set_attr "length" "7")
>     (set_attr "length_address" "4")])
>
> @@ -23594,6 +23611,7 @@ (define_insn "*tls_dynamic_gnu2_call_64_<mode>"
>    "TARGET_64BIT && TARGET_GNU2_TLS"
>    "call\t{*%a1@TLSCALL(%2)|[QWORD PTR [%2+%a1@TLSCALL]]}"
>    [(set_attr "type" "call")
> +   (set_attr "tls64" "call")
>     (set_attr "length" "2")
>     (set_attr "length_address" "0")])
>
> @@ -23615,7 +23633,8 @@ (define_insn_and_split 
> "*tls_dynamic_gnu2_combine_64_<mode>"
>  {
>    operands[4] = can_create_pseudo_p () ? gen_reg_rtx (ptr_mode) : 
> operands[0];
>    emit_insn (gen_tls_dynamic_gnu2_64 (ptr_mode, operands[4], operands[1]));
> -})
> +}
> +  [(set_attr "tls64" "combine")])
>
>  (define_split
>    [(match_operand 0 "tls_address_pattern")]
> diff --git a/gcc/testsuite/g++.target/i386/pr81501-1.C 
> b/gcc/testsuite/g++.target/i386/pr81501-1.C
> new file mode 100644
> index 00000000000..b2e89f4a5f0
> --- /dev/null
> +++ b/gcc/testsuite/g++.target/i386/pr81501-1.C
> @@ -0,0 +1,16 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-std=c++14 -mtls-dialect=gnu -O2 -fpic -fplt" } */
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { 
> target { ! ia32 } } } } */
> +
> +struct foo
> +{
> +  foo();
> +  ~foo();
> +};
> +
> +foo *
> +test ()
> +{
> +  static thread_local foo foo_tls;
> +  return &foo_tls;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-1a.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-1a.c
> new file mode 100644
> index 00000000000..30b4642a9ee
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-1a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */
> +
> +void a(long *);
> +int b(void);
> +void c(void);
> +static __thread long e;
> +long
> +d(void)
> +{
> +  a(&e);
> +  if (b())
> +    c();
> +  return e;
> +}
> +
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-1b.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-1b.c
> new file mode 100644
> index 00000000000..de25f226990
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-1b.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */
> +
> +#include "pr81501-1a.c"
> +
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*e@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-2a.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-2a.c
> new file mode 100644
> index 00000000000..a06302a468f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-2a.c
> @@ -0,0 +1,17 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */
> +
> +void a(long *);
> +int b(void);
> +void c(void);
> +extern __thread long e;
> +long
> +d(void)
> +{
> +  a(&e);
> +  if (b())
> +    c();
> +  return e;
> +}
> +
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-2b.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-2b.c
> new file mode 100644
> index 00000000000..4afb7426c81
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-2b.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */
> +
> +#include "pr81501-2a.c"
> +
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*e@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-3.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-3.c
> new file mode 100644
> index 00000000000..d4220630900
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-3.c
> @@ -0,0 +1,9 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */
> +
> +static __thread int local1;
> +int *
> +get_local1 (void)
> +{
> +  return &local1;
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-4a.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-4a.c
> new file mode 100644
> index 00000000000..0c655e259ff
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-4a.c
> @@ -0,0 +1,51 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**in_dso:
> +**.LFB[0-9]+:
> +**...
> +**     movl    %edi, %.*
> +**...
> +**     mov(l|q)        %(e|r)si, %.*
> +**...
> +**     call    __tls_get_addr@PLT
> +**...
> +*/
> +
> +__thread int foo;
> +
> +extern void bar1 (int *, int *);
> +extern void bar2 (int);
> +extern void bar3 (const char *);
> +
> +int
> +in_dso (int n, int *caller_foop)
> +{
> +  int *foop;
> +  int result = 0;
> +
> +  bar3 ("foo");                        /* Make sure PLT is used before 
> macros.  */
> +  asm ("" ::: "memory");
> +
> +  foop = &foo;
> +
> +  if (caller_foop != (void *) 0 && foop != caller_foop)
> +    {
> +      bar1 (caller_foop, foop);
> +      result = 1;
> +    }
> +  else if (*foop != n)
> +    {
> +      bar2 (n);
> +      result = 1;
> +    }
> +
> +  *foop = 16;
> +
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-4b.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-4b.c
> new file mode 100644
> index 00000000000..5d35712b70d
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-4b.c
> @@ -0,0 +1,6 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu2" } */
> +
> +#include "pr81501-4a.c"
> +
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*\*foo@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-5.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-5.c
> new file mode 100644
> index 00000000000..7f666e1c006
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-5.c
> @@ -0,0 +1,13 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */
> +
> +extern __thread int __bid_IDEC_glbflags;
> +extern long __bid64qq_div_bid_y_0_1;
> +extern void get_BID64(int *);
> +void
> +__bid64qq_div(void)
> +{
> +  if (__bid64qq_div_bid_y_0_1)
> +    __bid_IDEC_glbflags |= 1;
> +  get_BID64(&__bid_IDEC_glbflags);
> +}
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-6a.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-6a.c
> new file mode 100644
> index 00000000000..db8acf82883
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-6a.c
> @@ -0,0 +1,67 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**in_dso:
> +**.LFB[0-9]+:
> +**...
> +**     mov(l|q)        %(e|r)dx, %.*
> +**...
> +**     movl    %edi, %.*
> +**...
> +**     mov(l|q)        %(e|r)si, %.*
> +**...
> +**     call    __tls_get_addr@PLT
> +**...
> +*/
> +
> +__thread int foo;
> +__thread int bar;
> +
> +extern void fun1 (int *, int *);
> +extern void fun2 (int);
> +extern void fun3 (const char *);
> +
> +int
> +in_dso (int n, int *caller_foop, int *caller_barp)
> +{
> +  int *foop;
> +  int *barp;
> +  int result = 0;
> +
> +  fun3 ("foo");                        /* Make sure PLT is used before 
> macros.  */
> +  asm ("" ::: "memory");
> +
> +  foop = &foo;
> +  barp = &bar;
> +
> +  if (caller_foop != (void *) 0 && foop != caller_foop)
> +    {
> +      fun1 (caller_foop, foop);
> +      result = 1;
> +      if (caller_barp != (void *) 0 && barp != caller_barp)
> +       {
> +         fun1 (caller_barp, barp);
> +         result = 2;
> +       }
> +      else if (*barp != n)
> +       {
> +         fun2 (n);
> +         result = 3;
> +       }
> +    }
> +  else if (*foop != n)
> +    {
> +      fun2 (n);
> +      result = 4;
> +    }
> +
> +  *barp = 16;
> +  *foop = 16;
> +
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 2 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-6b.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-6b.c
> new file mode 100644
> index 00000000000..0b71f0a9039
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-6b.c
> @@ -0,0 +1,28 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu2" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**in_dso:
> +**.LFB[0-9]+:
> +**...
> +**     lea(l|q)        bar@TLSDESC\(%rip\), %(e|r)ax
> +**     mov(l|q)        %(e|r)si, %.*
> +**...
> +**     mov(l|q)        %(e|r)dx, %.*
> +**...
> +**     movl    %edi, %.*
> +**...
> +**     call    \*bar@TLSCALL\(%(e|r)ax\)
> +**...
> +**     lea(l|q)        foo@TLSDESC\(%rip\), %(e|r)ax
> +**...
> +**     call    \*foo@TLSCALL\(%(e|r)ax\)
> +**...
> +*/
> +
> +#include "pr81501-6a.c"
> +
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*foo@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*bar@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-7.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-7.c
> new file mode 100644
> index 00000000000..b2fe5d5eb85
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-7.c
> @@ -0,0 +1,20 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -fpic -fplt -mtls-dialect=gnu" } */
> +
> +extern int __bid_IDEC_glbround, __bid64qqq_fma_save_fpsf;
> +extern __thread int __bid_IDEC_glbflags;
> +typedef struct {
> +  long w[2];
> +} UINT128;
> +extern long __bid64qqq_fma_res_0_1;
> +extern void bid128_ext_fma(UINT128, UINT128);
> +void
> +__bid64qqq_fma(UINT128 y, UINT128 z)
> +{
> +  __bid_IDEC_glbflags = 0;
> +  bid128_ext_fma(y, z);
> +  if (__bid_IDEC_glbround || __bid64qqq_fma_res_0_1)
> +    __bid_IDEC_glbflags |= __bid64qqq_fma_save_fpsf;
> +}
> +
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-8a.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-8a.c
> new file mode 100644
> index 00000000000..7e14ef5cd4f
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-8a.c
> @@ -0,0 +1,82 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**in_dso:
> +**.LFB[0-9]+:
> +**...
> +**     mov(l|q)        %(e|r)dx, %.*
> +**...
> +**     movl    %edi, %.*
> +**...
> +**     mov(l|q)        %(e|r)si, %.*
> +**...
> +**     testb   %al, %al
> +**...
> +**     call    __tls_get_addr@PLT
> +**...
> +*/
> +
> +#include <stdarg.h>
> +
> +__thread int foo;
> +__thread int bar;
> +
> +extern void fun1 (int *, int *);
> +extern void fun2 (int);
> +extern void fun3 (const char *);
> +
> +int
> +in_dso (int n, int *caller_foop, int *caller_barp, ...)
> +{
> +  int *foop;
> +  int *barp;
> +  int result;
> +  va_list ap;
> +  double d;
> +
> +  va_start (ap, caller_barp);
> +
> +  result = 0;
> +
> +  fun3 ("foo");                        /* Make sure PLT is used before 
> macros.  */
> +  asm ("" ::: "memory");
> +
> +  foop = &foo;
> +  barp = &bar;
> +
> +  if (caller_foop != (void *) 0 && foop != caller_foop)
> +    {
> +      fun1 (caller_foop, foop);
> +      result = 1;
> +      if (caller_barp != (void *) 0 && barp != caller_barp)
> +       {
> +         fun1 (caller_barp, barp);
> +         result = 2;
> +       }
> +      else if (*barp != n)
> +       {
> +         fun2 (n);
> +         result = 3;
> +       }
> +    }
> +  else if (*foop != n)
> +    {
> +      fun2 (n);
> +      result = 4;
> +    }
> +
> +  *barp = 16;
> +  *foop = 16;
> +
> +  d = va_arg (ap, double);
> +  if (d != 1234.0)
> +    result = 10;
> +  va_end (ap);
> +
> +  return result;
> +}
> +
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 2 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-8b.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-8b.c
> new file mode 100644
> index 00000000000..778b2fb3507
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-8b.c
> @@ -0,0 +1,31 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -march=x86-64 -fpic -fplt -mtls-dialect=gnu2" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**in_dso:
> +**.LFB[0-9]+:
> +**...
> +**     mov(l|q)        %(e|r)si, %.*
> +**...
> +**     mov(l|q)        %(e|r)dx, %.*
> +**...
> +**     movl    %edi, %.*
> +**...
> +**     testb   %al, %al
> +**...
> +**     lea(l|q)        bar@TLSDESC\(%rip\), %(e|r)ax
> +**...
> +**     call    \*bar@TLSCALL\(%(e|r)ax\)
> +**...
> +**     lea(l|q)        foo@TLSDESC\(%rip\), %(e|r)ax
> +**...
> +**     call    \*foo@TLSCALL\(%(e|r)ax\)
> +**...
> +*/
> +
> +#include "pr81501-8a.c"
> +
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*foo@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*bar@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-9a.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-9a.c
> new file mode 100644
> index 00000000000..c5de37009c1
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-9a.c
> @@ -0,0 +1,39 @@
> +/* { dg-do compile } */
> +/* { dg-options "-O2 -march=x86-64-v4 -fpic -fplt -mtls-dialect=gnu" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +
> +/*
> +**foo:
> +**.LFB[0-9]+:
> +**...
> +**     vpbroadcastb    %edi, %zmm0
> +**...
> +**     call    __tls_get_addr@PLT
> +**...
> +*/
> +
> +#include <immintrin.h>
> +
> +extern __m512i sinkz;
> +extern __m256i sinky;
> +extern __m128i sinkx;
> +extern void func1 (long *);
> +extern int func2 (void);
> +extern void func3 (void);
> +static __thread long var;
> +
> +long
> +foo (char c)
> +{
> +  func1 (&var);
> +  if (func2 ())
> +    func3 ();
> +  sinkx = _mm_set1_epi8 (c);
> +  sinkz = _mm512_set1_epi8 (c);
> +  sinky = _mm256_set1_epi8 (c);
> +  return var;
> +}
> +
> +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
> +/* { dg-final { scan-assembler-times "call\[ \t\]__tls_get_addr@PLT" 1 { 
> target { ! ia32 } } } } */
> diff --git a/gcc/testsuite/gcc.target/i386/pr81501-9b.c 
> b/gcc/testsuite/gcc.target/i386/pr81501-9b.c
> new file mode 100644
> index 00000000000..711b177bc1e
> --- /dev/null
> +++ b/gcc/testsuite/gcc.target/i386/pr81501-9b.c
> @@ -0,0 +1,22 @@
> +/* { dg-do compile { target *-*-linux* } } */
> +/* { dg-options "-O2 -march=x86-64-v4 -fpic -fplt -mtls-dialect=gnu2" } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +/* { dg-final { check-function-bodies "**" "" "" { target { ! ia32 } } 
> {^\t?\.} } } */
> +/* Keep labels and directives ('.cfi_startproc', '.cfi_endproc').  */
> +
> +/*
> +**foo:
> +**.LFB[0-9]+:
> +**...
> +**     vpbroadcastb    %edi, %zmm0
> +**...
> +**     lea(l|q)        var@TLSDESC\(%rip\), %(e|r)ax
> +**...
> +**     call    \*var@TLSCALL\(%(e|r)ax\)
> +**...
> +*/
> +
> +#include "pr81501-9a.c"
> +
> +/* { dg-final { scan-assembler-times "vpbroadcastb" 1 } } */
> +/* { dg-final { scan-assembler-times "call\[ 
> \t\]\\*var@TLSCALL\\(%(?:r|e)ax\\)" 1 { target { ! ia32 } } } } */
> --
> 2.50.1
>


-- 
BR,
Hongtao

Reply via email to