Re: [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459]

Richard Biener Mon, 30 Nov 2020 01:24:22 -0800

On Sat, 28 Nov 2020, Jakub Jelinek wrote:

> Hi!
> 
> As discussed in the PR, e.g. on x86_64 (both -m32 and -m64) there is no
> double-word modulo and so we expand it to a __{,u}mod[dt]i3 call.
> For certain constant divisors we can do better.  E.g. consider
> 32-bit word-size, 0x100000000ULL % 3 == 1, so we can use partly the Hacker's
> delight modulo by summing digits approach and optimize
> unsigned long long foo (unsigned long long x) { return x % 3; }
> as
> unsigned long long foo (unsigned long long x) {
>   unsigned int sum, carry;
>   carry = __builtin_add_overflow ((unsigned int) x, (unsigned int) (x >> 32), 
> &sum);
>   sum += carry;
>   return sum % 3;
> }
> Similarly, 0x10000000ULL % 5 == 1 (note, 1 << 28), so
> unsigned long long bar (unsigned long long x) { return x % 5; }
> as
> unsigned long long bar (unsigned long long x) {
>   unsigned int sum = x & ((1 << 28) - 1);
>   sum += (x >> 28) & ((1 << 28) - 1);
>   sum += (x >> 56);
>   return sum % 5;
> }
> etc.
> And we can do also signed modulo,
> long long baz (long long x) { return x % 5; }
> as
> long long baz (long long x) {
>   unsigned int sum = x & ((1 << 28) - 1);
>   sum += ((unsigned long long) x >> 28) & ((1 << 28) - 1);
>   sum += ((unsigned long long) x >> 56);
>   /* Sum adjustment for negative x.  */
>   sum += (x >> 63) & 3;
>   unsigned int rem = sum % 5;
>   /* And finally adjust it to the right interval for negative values.  */
>   return (int) (rem + ((x >> 63) & -4));
> }
> 
> Bootstrapped/regtested on x86_64-linux and i686-linux, ok for trunk?


OK.

Thanks,
Richard.

> 2020-11-28  Jakub Jelinek  <[email protected]>
> 
>       PR rtl-optimization/97459
>       * internal-fn.h (expand_addsub_overflow): Declare.
>       * internal-fn.c (expand_addsub_overflow): No longer static.
>       * optabs.c (expand_doubleword_mod): New function.
>       (expand_binop): Optimize double-word mod with constant divisor.
> 
>       * gcc.dg/pr97459-1.c: New test.
>       * gcc.dg/pr97459-2.c: New test.
> 
> --- gcc/internal-fn.h.jj      2020-11-27 11:19:37.950190425 +0100
> +++ gcc/internal-fn.h 2020-11-27 13:18:13.116798464 +0100
> @@ -224,6 +224,8 @@ extern bool internal_gather_scatter_fn_s
>  extern bool internal_check_ptrs_fn_supported_p (internal_fn, tree,
>                                               poly_uint64, unsigned int);
>  
> +extern void expand_addsub_overflow (location_t, tree_code, tree, tree, tree,
> +                                 bool, bool, bool, bool, tree *);
>  extern void expand_internal_call (gcall *);
>  extern void expand_internal_call (internal_fn, gcall *);
>  extern void expand_PHI (internal_fn, gcall *);
> --- gcc/internal-fn.c.jj      2020-11-27 11:19:37.950190425 +0100
> +++ gcc/internal-fn.c 2020-11-27 13:18:13.117798453 +0100
> @@ -798,7 +798,7 @@ expand_ubsan_result_store (rtx target, r
>  /* Add sub/add overflow checking to the statement STMT.
>     CODE says whether the operation is +, or -.  */
>  
> -static void
> +void
>  expand_addsub_overflow (location_t loc, tree_code code, tree lhs,
>                       tree arg0, tree arg1, bool unsr_p, bool uns0_p,
>                       bool uns1_p, bool is_ubsan, tree *datap)
> --- gcc/optabs.c.jj   2020-11-27 11:19:38.000189859 +0100
> +++ gcc/optabs.c      2020-11-27 16:06:30.971435747 +0100
> @@ -44,6 +44,8 @@ along with GCC; see the file COPYING3.
>  #include "expr.h"
>  #include "optabs-tree.h"
>  #include "libfuncs.h"
> +#include "internal-fn.h"
> +#include "langhooks.h"
>  
>  static void prepare_float_lib_cmp (rtx, rtx, enum rtx_code, rtx *,
>                                  machine_mode *);
> @@ -926,6 +928,196 @@ expand_doubleword_mult (machine_mode mod
>    emit_move_insn (product_high, adjust);
>    return product;
>  }
> +
> +/* Subroutine of expand_binop.  Optimize unsigned double-word OP0 % OP1 for
> +   constant OP1.  If for some bit in [BITS_PER_WORD / 2, BITS_PER_WORD] range
> +   (prefer higher bits) ((1w << bit) % OP1) == 1, then the modulo can be
> +   computed in word-mode as ((OP0 & (bit - 1)) + ((OP0 >> bit) & (bit - 1))
> +   + (OP0 >> (2 * bit))) % OP1.  Whether we need to sum 2, 3 or 4 values
> +   depends on the bit value, if 2, then carry from the addition needs to be
> +   added too, i.e. like:
> +   sum += __builtin_add_overflow (low, high, &sum)
> +
> +   Optimize signed double-word OP0 % OP1 similarly, just apply some 
> correction
> +   factor to the sum before doing unsigned remainder, in the form of
> +   sum += (((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & const);
> +   then perform unsigned
> +   remainder = sum % OP1;
> +   and finally
> +   remainder += ((signed) OP0 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1);  */
> +
> +static rtx
> +expand_doubleword_mod (machine_mode mode, rtx op0, rtx op1, bool unsignedp)
> +{
> +  if (INTVAL (op1) <= 1)
> +    return NULL_RTX;
> +
> +  rtx_insn *last = get_last_insn ();
> +  for (int bit = BITS_PER_WORD; bit >= BITS_PER_WORD / 2; bit--)
> +    {
> +      wide_int w = wi::shifted_mask (bit, 1, false, 2 * BITS_PER_WORD);
> +      if (wi::ne_p (wi::umod_trunc (w, INTVAL (op1)), 1))
> +     continue;
> +      rtx sum = NULL_RTX, mask = NULL_RTX;
> +      if (bit == BITS_PER_WORD)
> +     {
> +       /* For signed modulo we need to add correction to the sum
> +          and that might again overflow.  */
> +       if (!unsignedp)
> +         continue;
> +       if (optab_handler (uaddv4_optab, word_mode) == CODE_FOR_nothing)
> +         continue;
> +       tree wtype = lang_hooks.types.type_for_mode (word_mode, 1);
> +       if (wtype == NULL_TREE)
> +         continue;
> +       tree ctype = build_complex_type (wtype);
> +       if (TYPE_MODE (ctype) != GET_MODE_COMPLEX_MODE (word_mode))
> +         continue;
> +       machine_mode cmode = TYPE_MODE (ctype);
> +       rtx op00 = operand_subword_force (op0, 0, mode);
> +       rtx op01 = operand_subword_force (op0, 1, mode);
> +       rtx cres = gen_rtx_CONCAT (cmode, gen_reg_rtx (word_mode),
> +                                  gen_reg_rtx (word_mode));
> +       tree lhs = make_tree (ctype, cres);
> +       tree arg0 = make_tree (wtype, op00);
> +       tree arg1 = make_tree (wtype, op01);
> +       expand_addsub_overflow (UNKNOWN_LOCATION, PLUS_EXPR, lhs, arg0,
> +                               arg1, true, true, true, false, NULL);
> +       sum = expand_simple_binop (word_mode, PLUS, XEXP (cres, 0),
> +                                  XEXP (cres, 1), NULL_RTX, 1,
> +                                  OPTAB_DIRECT);
> +       if (sum == NULL_RTX)
> +         return NULL_RTX;
> +     }
> +      else
> +     {
> +       /* Code below uses GEN_INT, so we need the masks to be representable
> +          in HOST_WIDE_INTs.  */
> +       if (bit >= HOST_BITS_PER_WIDE_INT)
> +         continue;
> +       /* If op0 is e.g. -1 or -2 unsigned, then the 2 additions might
> +          overflow.  Consider 64-bit -1ULL for word size 32, if we add
> +          0x7fffffffU + 0x7fffffffU + 3U, it wraps around to 1.  */
> +       if (bit == BITS_PER_WORD - 1)
> +         continue;
> +
> +       int count = (2 * BITS_PER_WORD + bit - 1) / bit;
> +       rtx sum_corr = NULL_RTX;
> +
> +       if (!unsignedp)
> +         {
> +           /* For signed modulo, compute it as unsigned modulo of
> +              sum with a correction added to it if OP0 is negative,
> +              such that the result can be computed as unsigned
> +              remainder + ((OP1 >> (2 * BITS_PER_WORD - 1)) & (1 - OP1).  */
> +           w = wi::min_value (2 * BITS_PER_WORD, SIGNED);
> +           wide_int wmod1 = wi::umod_trunc (w, INTVAL (op1));
> +           wide_int wmod2 = wi::smod_trunc (w, INTVAL (op1));
> +           /* wmod2 == -wmod1.  */
> +           wmod2 = wmod2 + (INTVAL (op1) - 1);
> +           if (wi::ne_p (wmod1, wmod2))
> +             {
> +               wide_int wcorr = wmod2 - wmod1;
> +               if (wi::neg_p (w))
> +                 wcorr = wcorr + INTVAL (op1);
> +               /* Now verify if the count sums can't overflow, and punt
> +                  if they could.  */
> +               w = wi::mask (bit, false, 2 * BITS_PER_WORD);
> +               w = w * (count - 1);
> +               w = w + wi::mask (2 * BITS_PER_WORD - (count - 1) * bit,
> +                                 false, 2 * BITS_PER_WORD);
> +               w = w + wcorr;
> +               w = wi::lrshift (w, BITS_PER_WORD);
> +               if (wi::ne_p (w, 0))
> +                 continue;
> +
> +               mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
> +                                             mode);
> +               mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
> +                                           GEN_INT (BITS_PER_WORD - 1),
> +                                           NULL_RTX, 0, OPTAB_DIRECT);
> +               if (mask == NULL_RTX)
> +                 return NULL_RTX;
> +               sum_corr = immed_wide_int_const (wcorr, word_mode);
> +               sum_corr = expand_simple_binop (word_mode, AND, mask,
> +                                               sum_corr, NULL_RTX, 1,
> +                                               OPTAB_DIRECT);
> +               if (sum_corr == NULL_RTX)
> +                 return NULL_RTX;
> +             }
> +         }
> +
> +       for (int i = 0; i < count; i++)
> +         {
> +           rtx v = op0;
> +           if (i)
> +             v = expand_simple_binop (mode, LSHIFTRT, v, GEN_INT (i * bit),
> +                                      NULL_RTX, 1, OPTAB_DIRECT);
> +           if (v == NULL_RTX)
> +             return NULL_RTX;
> +           v = lowpart_subreg (word_mode, v, mode);
> +           if (v == NULL_RTX)
> +             return NULL_RTX;
> +           if (i != count - 1)
> +             v = expand_simple_binop (word_mode, AND, v,
> +                                      GEN_INT ((HOST_WIDE_INT_1U << bit)
> +                                               - 1), NULL_RTX, 1,
> +                                      OPTAB_DIRECT);
> +           if (v == NULL_RTX)
> +             return NULL_RTX;
> +           if (sum == NULL_RTX)
> +             sum = v;
> +           else
> +             sum = expand_simple_binop (word_mode, PLUS, sum, v, NULL_RTX,
> +                                        1, OPTAB_DIRECT);
> +           if (sum == NULL_RTX)
> +             return NULL_RTX;
> +         }
> +       if (sum_corr)
> +         {
> +           sum = expand_simple_binop (word_mode, PLUS, sum, sum_corr,
> +                                      NULL_RTX, 1, OPTAB_DIRECT);
> +           if (sum == NULL_RTX)
> +             return NULL_RTX;
> +         }
> +     }
> +      rtx remainder = expand_divmod (1, TRUNC_MOD_EXPR, word_mode, sum, op1,
> +                                  NULL_RTX, 1);
> +      if (remainder == NULL_RTX)
> +     return NULL_RTX;
> +
> +      if (!unsignedp)
> +     {
> +       if (mask == NULL_RTX)
> +         {
> +           mask = operand_subword_force (op0, WORDS_BIG_ENDIAN ? 0 : 1,
> +                                         mode);
> +           mask = expand_simple_binop (word_mode, ASHIFTRT, mask,
> +                                       GEN_INT (BITS_PER_WORD - 1),
> +                                       NULL_RTX, 0, OPTAB_DIRECT);
> +           if (mask == NULL_RTX)
> +             return NULL_RTX;
> +         }
> +       mask = expand_simple_binop (word_mode, AND, mask,
> +                                   GEN_INT (1 - INTVAL (op1)),
> +                                   NULL_RTX, 1, OPTAB_DIRECT);
> +       if (mask == NULL_RTX)
> +         return NULL_RTX;
> +       remainder = expand_simple_binop (word_mode, PLUS, remainder,
> +                                        mask, NULL_RTX, 1, OPTAB_DIRECT);
> +       if (remainder == NULL_RTX)
> +         return NULL_RTX;
> +     }
> +
> +      remainder = convert_modes (mode, word_mode, remainder, unsignedp);
> +      /* Punt if we need any library calls.  */
> +      for (; last; last = NEXT_INSN (last))
> +     if (CALL_P (last))
> +       return NULL_RTX;
> +      return remainder;
> +    }
> +  return NULL_RTX;
> +}
>  
>  /* Wrapper around expand_binop which takes an rtx code to specify
>     the operation to perform, not an optab pointer.  All other
> @@ -1806,6 +1998,37 @@ expand_binop (machine_mode mode, optab b
>       }
>      }
>  
> +  /* Attempt to synthetize double word modulo by constant divisor.  */
> +  if ((binoptab == umod_optab || binoptab == smod_optab)
> +      && optimize
> +      && CONST_INT_P (op1)
> +      && is_int_mode (mode, &int_mode)
> +      && GET_MODE_SIZE (int_mode) == 2 * UNITS_PER_WORD
> +      && optab_handler (lshr_optab, int_mode) != CODE_FOR_nothing
> +      && optab_handler (and_optab, word_mode) != CODE_FOR_nothing
> +      && optab_handler (add_optab, word_mode) != CODE_FOR_nothing
> +      && optimize_insn_for_speed_p ())
> +    {
> +      rtx remainder = expand_doubleword_mod (int_mode, op0, op1,
> +                                          binoptab == umod_optab);
> +      if (remainder != NULL_RTX)
> +     {
> +       if (optab_handler (mov_optab, int_mode) != CODE_FOR_nothing)
> +         {
> +           rtx_insn *move = emit_move_insn (target ? target : remainder,
> +                                            remainder);
> +           set_dst_reg_note (move,
> +                             REG_EQUAL,
> +                             gen_rtx_fmt_ee (UMOD, int_mode,
> +                                             copy_rtx (op0), op1),
> +                             target ? target : remainder);
> +         }
> +       return remainder;
> +     }
> +      else
> +     delete_insns_since (last);
> +    }
> +
>    /* It can't be open-coded in this mode.
>       Use a library call if one is available and caller says that's ok.  */
>  
> --- gcc/testsuite/gcc.dg/pr97459-1.c.jj       2020-11-27 14:16:50.735828637 
> +0100
> +++ gcc/testsuite/gcc.dg/pr97459-1.c  2020-11-27 14:16:12.212259188 +0100
> @@ -0,0 +1,54 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __uint128_t T;
> +#else
> +typedef unsigned long long T;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n) { return x % n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); 
> }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> +           C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> +           C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> +           C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> +  { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> +  int i, j, k;
> +  for (k = 0; tests[k].x; k++)
> +    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> +      for (j = -5; j <= 5; j++)
> +     {
> +       T x = ((T) 1 << i) + j;
> +       if (foo (x, tests[k].x) != tests[k].foo (x))
> +         __builtin_abort ();
> +     }
> +  return 0;
> +}
> --- gcc/testsuite/gcc.dg/pr97459-2.c.jj       2020-11-27 15:53:36.831080388 
> +0100
> +++ gcc/testsuite/gcc.dg/pr97459-2.c  2020-11-27 15:50:20.763269826 +0100
> @@ -0,0 +1,57 @@
> +/* PR rtl-optimization/97459 */
> +/* { dg-do run } */
> +/* { dg-options "-O2" } */
> +/* { dg-additional-options "-DEXPENSIVE" { target run_expensive_tests } } */
> +
> +#ifdef __SIZEOF_INT128__
> +typedef __int128_t T;
> +typedef __uint128_t U;
> +#else
> +typedef long long T;
> +typedef unsigned long long U;
> +#endif
> +
> +T __attribute__((noipa)) foo (T x, T n) { return x % n; }
> +#define C(n) T __attribute__((noipa)) foo##n (T x) { return x % (n - 10000); 
> }
> +
> +#define C1(n) C(n##1) C(n##3) C(n##5) C(n##7) C(n##9)
> +#define C2(n) C1(n##0) C1(n##1) C1(n##2) C1(n##3) C1(n##4) \
> +           C1(n##5) C1(n##6) C1(n##7) C1(n##8) C1(n##9)
> +#ifdef EXPENSIVE
> +#define C3(n) C2(n##0) C2(n##1) C2(n##2) C2(n##3) C2(n##4) \
> +           C2(n##5) C2(n##6) C2(n##7) C2(n##8) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##1) C3(n##2) C3(n##3) C3(n##4) \
> +           C3(n##5) C3(n##6) C3(n##7) C3(n##8) C3(n##9)
> +#else
> +#define C3(n) C2(n##0) C2(n##4) C2(n##9)
> +#define C4(n) C3(n##0) C3(n##3) C3(n##7)
> +#endif
> +#define TESTS C4(1)
> +
> +TESTS
> +
> +struct S { T x; T (*foo) (T); };
> +
> +#undef C
> +#define C(n) { n - 10000, foo##n },
> +
> +struct S tests[] = {
> +TESTS
> +  { 0, 0 }
> +};
> +
> +int
> +main ()
> +{
> +  int i, j, k;
> +  for (k = 0; tests[k].x; k++)
> +    for (i = 0; i < sizeof (T) * __CHAR_BIT__; i++)
> +      for (j = -5; j <= 5; j++)
> +     {
> +       U x = ((U) 1 << i) + j;
> +       if (foo ((T) x, tests[k].x) != tests[k].foo ((T) x)
> +           || foo ((T) -x, tests[k].x) != tests[k].foo ((T) -x))
> +         __builtin_abort ();
> +     }
> +  return 0;
> +}
> 
>       Jakub
> 
> 

-- 
Richard Biener <[email protected]>
SUSE Software Solutions Germany GmbH, Maxfeldstrasse 5, 90409 Nuernberg,
Germany; GF: Felix Imendörffer; HRB 36809 (AG Nuernberg)

Re: [PATCH] expansion: Improve double-word modulo by certain constant divisors [PR97459]

Reply via email to