On Thu, 2023-08-31 at 10:46 +0800, chenxiaolong wrote:
> +;; Implement __builtin_fabs128 function.
> +
> +(define_expand "abstf2"
> +  [(match_operand:TF 0 "register_operand")
> +   (match_operand:TF 1 "register_operand")]
> +  "TARGET_64BIT"
> +{
> +  loongarch_emit_move (operands[0], operands[1]);
> +  emit_insn (gen_abstf_local (operands[0]));
> +  DONE;
> +})
> +
> +(define_insn "abstf_local"
> +  [(set (match_operand:TF 0 "register_operand" "+r")
> +       (abs:TF (match_dup 0)))]
> +  "TARGET_64BIT"
> +{
> +  operands[0] = gen_rtx_REG (DImode, REGNO (operands[0]) + 1);
> +  return "bstrins.d\t%0,$r0,0x3f,0x3f";
> +})

This should be removed because the "generic" expand works fine:

$ cat t.c
_Float128 fabsf128 (_Float128 in)
{
  return __builtin_fabsf128 (in);
}
$ cc t.c -S -O2 -o-
fabsf128:
.LFB0 = .
        .cfi_startproc
        bstrpick.d      $r5,$r5,62,0
        jr      $r1
        .cfi_endproc

It does not work with -O0, but -O0 means "not optimized" anyway.

> +;; Implement __builtin_copysignf128 function.
> +
> +(define_insn_and_split "copysigntf3"
> +  [(set (match_operand:TF 0 "register_operand" "=&r")
> +       (unspec:TF [(match_operand:TF 1 "register_operand" "r")
> +                   (match_operand:TF 2 "register_operand" "r")]
> +                   UNSPEC_COPYSIGNF128))]
> +  "TARGET_64BIT"
> +  "#"
> +  "reload_completed"
> + [(const_int 0)]
> +{
> +  rtx op0_lo = gen_rtx_REG (DImode,REGNO (operands[0]) + 0);
> +  rtx op0_hi = gen_rtx_REG (DImode,REGNO (operands[0]) + 1);
> +  rtx op1_lo = gen_rtx_REG (DImode,REGNO (operands[1]) + 0);
> +  rtx op1_hi = gen_rtx_REG (DImode,REGNO (operands[1]) + 1);
> +  rtx op2_hi = gen_rtx_REG (DImode,REGNO (operands[2]) + 1);
> +
> +  if (REGNO (operands[1]) == REGNO (operands[2]))
> +    {
> +      loongarch_emit_move (operands[0], operands[1]);
> +      DONE;
> +    }
> +  else
> +    {
> +      loongarch_emit_move (op0_hi, op2_hi);
> +      loongarch_emit_move (op0_lo, op1_lo);
> +      emit_insn (gen_insvdi (op0_hi, GEN_INT (63), GEN_INT (0), op1_hi));
> +      DONE;
> +    }
> +})

Hmm... The generic implementation does not work well here:

copysignf128:
.LFB0 = .
        .cfi_startproc
        or      $r12,$r0,$r0
        lu52i.d $r12,$r12,0x8000000000000000>>52
        and     $r7,$r7,$r12
        bstrpick.d      $r5,$r5,62,0
        or      $r5,$r5,$r7
        jr      $r1
        .cfi_endproc

It's sub-optimal.  But there seems to be a general issue with cases like

int test(int a, int b)
{
  return (a & ~0x10) | (b & 0x10);
}

It's compiled to:

test:
.LFB0 = .
        .cfi_startproc
        addi.w  $r12,$r0,-17                    # 0xffffffffffffffef
        and     $r12,$r12,$r4
        andi    $r5,$r5,16
        or      $r12,$r12,$r5
        slli.w  $r4,$r12,0
        jr      $r1
        .cfi_endproc

But the optimal implementation should be:

bstrpick.w $r4, $r4, 4, 4
bstrins.w  $r5, $r4, 4, 4
or         $r5, $r4, $r0

So in my opinion we should fix the general case instead.  Please hold
this part (you can commit the rest of the patch w/o the loongarch.md
change for now), and I'll try to fix the general case.

Created https://gcc.gnu.org/PR111252 for tracking the issue.

-- 
Xi Ruoyao <xry...@xry111.site>
School of Aerospace Science and Technology, Xidian University

Reply via email to