On Thu, 9 Jul 2015, Bernhard Reutner-Fischer wrote:

> gcc/ChangeLog
> 
> 2015-07-09  Bernhard Reutner-Fischer  <al...@gcc.gnu.org>
> 
>       * builtins.c (fold_builtin_tolower, fold_builtin_toupper): New
>       static functions.
>       (fold_builtin_1): Handle BUILT_IN_TOLOWER, BUILT_IN_TOUPPER.

As I read it, you fold tolower (X) to (X) >= target_char_set ('A')
&& (X) <= target_char_set ('Z') ? (X) - target_char_set ('A') +
target_char_set ('a') : (X);

I don't think this can be correct for all locales: a locale need not
have a lower-case character for every upper-case one, nor do all
letters that have one need to be in the range 'A' to 'Z'.

Joseph will surely correct me if I am wrong.

What would eventually work is constant folding.

Richard.

> Signed-off-by: Bernhard Reutner-Fischer <rep.dot....@gmail.com>
> ---
>  gcc/builtins.c | 99 
> ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 99 insertions(+)
> 
> Using the three testcases attached to PR66741 where the -1.c one is using
> builtins
> $ for i in 0 1 2;do gcc -o tolower_strcpy-$i tolower_strcpy-$i.c -Ofast -W 
> -Wall -Wextra -pedantic -DMAIN -msse4.2;done
> 
> pristine (trunk@225368):
> # tolower_strcpy-0
> 
> real  0m6.068s
> user  0m3.204s
> sys   0m2.840s
> # tolower_strcpy-1
> 
> real  0m8.097s
> user  0m5.548s
> sys   0m2.528s
> # tolower_strcpy-2
> 
> real  0m3.568s
> user  0m0.804s
> sys   0m2.748s
> 
> trunk@225368 + fold tolower/toupper below
> 
> # tolower_strcpy-0
> 
> real  0m6.055s
> user  0m3.212s
> sys   0m2.832s
> # tolower_strcpy-1
> 
> real  0m5.383s
> user  0m2.464s
> sys   0m2.900s
> # tolower_strcpy-2
> 
> real  0m3.605s
> user  0m0.668s
> sys   0m2.924s
> 
> The tolower loop now ends up as
> .L5:
>         movsbl  (%rbx), %edx
>         leal    32(%rdx), %ecx
>         movl    %edx, %eax
>         subl    $65, %edx
>         cmpl    $25, %edx
>         cmovbe  %ecx, %eax
>         addq    $1, %rbx
>         movb    %al, -1(%rbx)
>         cmpq    %rsi, %rbx
>         jne     .L5
> 
> instead of the former call
> 
> .L5:
>         movsbl  (%rbx), %edi
>         addq    $1, %rbx
>         call    tolower
>         movb    %al, -1(%rbx)
>         cmpq    %rbp, %rbx
>         jne     .L5
> 
> Would something like attached be ok for trunk after proper testing?
> Advice on the questions inline WRT caching lang_hooks intermediate
> results?
> Hints on further steps towards fixing the PR?
> 
> I think the next step would be to try to teach graphite to fuse the two
> loops in tolower_strcpy-0.c. Need to look at graphite..
> Then see how to classify builtins that could be expanded early and what
> breaks if doing so. This sounds like a potential disaster, fun.
> Next, see why the vectorizer (or something else) does not pave the way
> to use SSE instruction as the tolower_strcpy-2.c does.
> 
> thanks,
> 
> diff --git a/gcc/builtins.c b/gcc/builtins.c
> index 5f53342..421c908 100644
> --- a/gcc/builtins.c
> +++ b/gcc/builtins.c
> @@ -204,6 +204,9 @@ static tree fold_builtin_strrchr (location_t, tree, tree, 
> tree);
>  static tree fold_builtin_strspn (location_t, tree, tree);
>  static tree fold_builtin_strcspn (location_t, tree, tree);
>  
> +static tree fold_builtin_tolower (location_t, tree);
> +static tree fold_builtin_toupper (location_t, tree);
> +
>  static rtx expand_builtin_object_size (tree);
>  static rtx expand_builtin_memory_chk (tree, rtx, machine_mode,
>                                     enum built_in_function);
> @@ -10285,6 +10288,12 @@ fold_builtin_1 (location_t loc, tree fndecl, tree 
> arg0)
>      case BUILT_IN_ISDIGIT:
>        return fold_builtin_isdigit (loc, arg0);
>  
> +    case BUILT_IN_TOLOWER:
> +      return fold_builtin_tolower (loc, arg0);
> +
> +    case BUILT_IN_TOUPPER:
> +      return fold_builtin_toupper (loc, arg0);
> +
>      CASE_FLT_FN (BUILT_IN_FINITE):
>      case BUILT_IN_FINITED32:
>      case BUILT_IN_FINITED64:
> @@ -11208,6 +11217,96 @@ fold_builtin_strcspn (location_t loc, tree s1, tree 
> s2)
>      }
>  }
>  
> +
> +/* Simplify a call to the tolower builtin.  ARG is the argument to the call.
> +
> +   Return NULL_TREE if no simplification was possible, otherwise return the
> +   simplified form of the call as a tree.  */
> +
> +static tree
> +fold_builtin_tolower (location_t loc, tree arg)
> +{
> +  if (!validate_arg (arg, INTEGER_TYPE))
> +    return NULL_TREE;
> +
> +  /* Transform tolower(c) -> (unsigned)(c) | 0x20.
> +
> +     More specifically:
> +     unsigned tem = arg - 'A';
> +     if (tem <= ('Z' - 'A'))
> +       arg += 'a' - 'A';
> +     return arg;
> +   */
> +  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
> +  unsigned HOST_WIDE_INT target_Z = lang_hooks.to_target_charset ('Z');
> +  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
> +  if (target_A == 0
> +      || target_Z == 0
> +      || target_a == 0)
> +    return NULL_TREE;
> +
> +  arg = fold_convert_loc (loc, unsigned_type_node, arg);
> +  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> +                       build_int_cst (unsigned_type_node, target_A));
> +  /* ??? x19 and x20 would better live in static storage; Think:
> +   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
> +   */
> +  unsigned HOST_WIDE_INT x19 = target_Z - target_A;
> +  unsigned HOST_WIDE_INT x20 = target_a - target_A;
> +  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
> +                      build_int_cst (unsigned_type_node, x19));
> +  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
> +                      fold_build2 (PLUS_EXPR, unsigned_type_node, arg,
> +                                   build_int_cst (unsigned_type_node, x20)),
> +                      arg);
> +  return fold_convert_loc (loc, integer_type_node, tem);
> +}
> +
> +/* Simplify a call to the toupper builtin.  ARG is the argument to the call.
> +
> +   Return NULL_TREE if no simplification was possible, otherwise return the
> +   simplified form of the call as a tree.  */
> +
> +static tree
> +fold_builtin_toupper (location_t loc, tree arg)
> +{
> +  if (!validate_arg (arg, INTEGER_TYPE))
> +    return NULL_TREE;
> +
> +  /* Transform toupper(c) -> (unsigned)(c) ^ 0x20.
> +
> +     More specifically:
> +     unsigned tem = arg - 'a';
> +     if (tem <= ('z' - 'a'))
> +       arg -= 'a' - 'A';
> +     return arg;
> +   */
> +  unsigned HOST_WIDE_INT target_A = lang_hooks.to_target_charset ('A');
> +  unsigned HOST_WIDE_INT target_z = lang_hooks.to_target_charset ('z');
> +  unsigned HOST_WIDE_INT target_a = lang_hooks.to_target_charset ('a');
> +  if (target_A == 0
> +      || target_z == 0
> +      || target_a == 0)
> +    return NULL_TREE;
> +
> +  arg = fold_convert_loc (loc, unsigned_type_node, arg);
> +  tree tem = fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> +                       build_int_cst (unsigned_type_node, target_a));
> +  /* ??? x19 and x20 would better live in static storage; Think:
> +   * static struct static_fold_tolower {uHWI x19, x20; unsigned probe_done};
> +   */
> +  unsigned HOST_WIDE_INT x19 = target_z - target_a;
> +  unsigned HOST_WIDE_INT x20 = target_a - target_A;
> +  tem = fold_build2_loc (loc, LE_EXPR, integer_type_node, tem,
> +                      build_int_cst (unsigned_type_node, x19));
> +  tem = fold_build3_loc (loc, COND_EXPR, unsigned_type_node, tem,
> +                      fold_build2 (MINUS_EXPR, unsigned_type_node, arg,
> +                                   build_int_cst (unsigned_type_node, x20)),
> +                      arg);
> +  return fold_convert_loc (loc, integer_type_node, tem);
> +}
> +
> +
>  /* Fold the next_arg or va_start call EXP. Returns true if there was an error
>     produced.  False otherwise.  This is done so that we don't output the 
> error
>     or warning twice or three times.  */
> 

-- 
Richard Biener <rguent...@suse.de>
SUSE LINUX GmbH, GF: Felix Imendoerffer, Jane Smithard, Dilip Upmanyu, Graham 
Norton, HRB 21284 (AG Nuernberg)

Reply via email to