Hello,

On Sat, Aug 27 2022, Richard Biener wrote:
>> Am 26.08.2022 um 23:45 schrieb Martin Jambor <mjam...@suse.cz>:
>>
>> Hi,
>>
>>> On Fri, Aug 26 2022, Richard Sandiford wrote:
>>> Richard Biener <rguent...@suse.de> writes:
>>>>> Am 26.08.2022 um 18:40 schrieb Martin Jambor <mjam...@suse.cz>:
>>>>>
>>>>> Hi,
>>>>>
>>>>> This adds a method to binary search in a sorted array_slice.
>>>>>
>>>>> The implementation is direct copy of vec:bsearch.  Moreover, to only
>>>>> copy it once and not twice, I used const_cast in the non-const
>>>>> variants to be able to use the const variants.  I hope that is
>>>>> acceptable abuse of const_cast but I'll be happy to change that if
>>>>> not.
>>>>>
>>>>> Bootstrapped and tested along code that actually uses it on
>>>>> x86_64-linux.  OK for trunk?
>>>>
>>>> Can you avoid the copying by using array slice bsearch from the vec<> 
>>>> bsearch?
>>>
>>> IMO it would be better to transition to using <algorithm> routines
>>> for this kind of thing (for new code).  In this case that would be
>>> std::lower_bound.
>>>
>>> Using std::lower_bound is more convenient because it avoids the void *
>>> thing (you can use lambdas to capture any number of variables instead)
>>> and because it works on subranges, not just whole ranges.
>>>
>>
>> OK, I can use std::lower_bound with simple lambdas too.  The semantics
>> of returning the first matching a criterion actually allows me to use it
>> one more time.
>
> Can you try to compare generated code?

I have had a look and the std::lower_bound is slightly less efficient,
we get 40 instructions or so instead of 28 or so, but it is mostly
because it lacks the early exit bsearch has when its comparator returns
zero and because of the final checks whether the lower bound is what we
were searching for.  But the templates and lambdas themselves do not
seem to lead to any egregious overhead.

As I wrote earlier, in one of the three searches I want to do I actually
want to look for a lower bound (in the example below the first with the
given index) and so I'd be willing to accept the trade-off.

Full story:

The data structure being searched is essentially an array of these
structures, sorted primarily by index and those with the same index by
unit_offset:

  struct GTY(()) ipa_argagg_value
  {
    tree value;
    unsigned unit_offset;
    unsigned index : 16;
    unsigned by_ref : 1;
  };


The C-way implementation:

  ----------------------------------------------------------------------
  /* Callback for bsearch and qsort to sort aggregate values.  */

  static int
  compare_agg_values (const void *a, const void *b)
  {
    const ipa_argagg_value *agg1 = (const ipa_argagg_value *) a;
    const ipa_argagg_value *agg2 = (const ipa_argagg_value *) b;

    if (agg1->index < agg2->index)
      return -1;
    if (agg1->index > agg2->index)
      return 1;

    if (agg1->unit_offset < agg2->unit_offset)
      return -1;
    if (agg1->unit_offset > agg2->unit_offset)
      return 1;
    return 0;
  }

  const ipa_argagg_value * __attribute__((noinline))
  testfun (const array_slice<const ipa_argagg_value> &elts,
          int index, unsigned unit_offset)
  {
    ipa_argagg_value key;
    key.index = index;
    key.unit_offset = unit_offset;
    return elts.bsearch (&key, compare_agg_values);
  }
  ----------------------------------------------------------------------

The C++-ish one:

  ----------------------------------------------------------------------
  const ipa_argagg_value * __attribute__((noinline))
  testfun (const array_slice<const ipa_argagg_value> &elts,
          int index, unsigned unit_offset)
  {
    ipa_argagg_value key;
    key.index = index;
    key.unit_offset = unit_offset;
    const ipa_argagg_value *res
      = std::lower_bound (elts.begin (), elts.end (), key,
                        [] (const ipa_argagg_value &elt,
                            const ipa_argagg_value &val)
                        {
                          if (elt.index < val.index)
                            return true;
                          if (elt.index > val.index)
                            return false;
                          if (elt.unit_offset < val.unit_offset)
                            return true;
                          return false;
                        });

    if (res == elts.end ()
        || res->index != index
        || res->unit_offset != unit_offset)
      res = nullptr;
    return res;
  }
  ----------------------------------------------------------------------

Using GCC 12.1, the use of bsearch yields the following optimized dump:

  ----------------------------------------------------------------------
  ;; Function testfun (_Z7testfunRK11array_sliceIK16ipa_argagg_valueEij, 
funcdef_no=3449, decl_uid=129622, cgraph_uid=2433, symbol_order=2603)

  __attribute__((noinline))
  const struct ipa_argagg_value * testfun (const struct array_slice & elts, int 
index, unsigned int unit_offset)
  {
    const void * p;
    size_t idx;
    size_t u;
    size_t l;
    short unsigned int _1;
    const struct ipa_argagg_value * _11;
    unsigned int _12;
    long unsigned int _15;
    long unsigned int _18;
    long unsigned int _20;
    const struct ipa_argagg_value * _24;
    short unsigned int _29;
    unsigned int _33;

    <bb 2> [local count: 1073741824]:
    _1 = (short unsigned int) index_2(D);
    _11 = MEM[(const struct ipa_argagg_value * *)elts_7(D)];
    _12 = MEM[(unsigned int *)elts_7(D) + 8B];
    _15 = (long unsigned int) _12;
    goto <bb 8>; [100.00%]

    <bb 3> [local count: 11844779787]:
    # u_30 = PHI <idx_19(9), u_27(8)>
    _18 = u_30 + l_34;
    idx_19 = _18 >> 1;
    _20 = idx_19 * 16;
    p_21 = _11 + _20;
    _29 = MEM[(const struct ipa_argagg_value *)p_21].index;
    if (_1 < _29)
      goto <bb 9>; [34.00%]
    else
      goto <bb 4>; [66.00%]

    <bb 4> [local count: 7817554615]:
    if (_1 > _29)
      goto <bb 7>; [34.00%]
    else
      goto <bb 5>; [66.00%]

    <bb 5> [local count: 5159586016]:
    _33 = MEM[(const struct ipa_argagg_value *)p_21].unit_offset;
    if (unit_offset_5(D) < _33)
      goto <bb 9>; [34.00%]
    else
      goto <bb 6>; [66.00%]

    <bb 6> [local count: 3405326750]:
    if (unit_offset_5(D) > _33)
      goto <bb 7>; [94.50%]
    else
      goto <bb 10>; [5.50%]

    <bb 7> [local count: 6604057032]:
    l_23 = idx_19 + 1;

    <bb 8> [local count: 7677798856]:
    # l_34 = PHI <0(2), l_23(7)>
    # u_27 = PHI <_15(2), u_30(7)>
    if (u_27 > l_34)
      goto <bb 3>; [94.50%]
    else
      goto <bb 10>; [5.50%]

    <bb 9> [local count: 5781484434]:
    if (idx_19 > l_34)
      goto <bb 3>; [94.50%]
    else
      goto <bb 10>; [5.50%]

    <bb 10> [local count: 1073741824]:
    # _24 = PHI <p_21(6), 0B(9), 0B(8)>
    return _24;

  }
  ----------------------------------------------------------------------

And the following assembly.

  ----------------------------------------------------------------------
        .p2align 4
        .globl  _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij
        .type   _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij, @function
  _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij:
  .LFB3449:
        .cfi_startproc
        movq    (%rdi), %r10
        movl    8(%rdi), %r9d
        xorl    %edi, %edi
        jmp     .L1392
        .p2align 4,,10
        .p2align 3
  .L1400:
        cmpw    %si, %r8w
        jb      .L1394
        movl    8(%rax), %r8d
        cmpl    %r8d, %edx
        jb      .L1393
        cmpl    %edx, %r8d
        jnb     .L1391
  .L1394:
        leaq    1(%rcx), %rdi
  .L1392:
        cmpq    %r9, %rdi
        jnb     .L1399
  .L1396:
        leaq    (%r9,%rdi), %rcx
        shrq    %rcx
        movq    %rcx, %rax
        salq    $4, %rax
        addq    %r10, %rax
        movzwl  12(%rax), %r8d
        cmpw    %r8w, %si
        jnb     .L1400
  .L1393:
        cmpq    %rcx, %rdi
        jnb     .L1399
        movq    %rcx, %r9
        jmp     .L1396
        .p2align 4,,10
        .p2align 3
  .L1399:
        xorl    %eax, %eax
  .L1391:
        ret
        .cfi_endproc
  .LFE3449:
        .size   _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij, 
.-_Z7testfunRK11array_sliceIK16ipa_argagg_valueEij
  ----------------------------------------------------------------------

The C++ <algorithm> std::lower_bound leads to the following optimized
dump:

  ----------------------------------------------------------------------
  ;; Function testfun (_Z7testfunRK11array_sliceIK16ipa_argagg_valueEij, 
funcdef_no=4007, decl_uid=138795, cgraph_uid=2481, symbol_order=2656)

  __attribute__((noinline))
  const struct ipa_argagg_value * testfun (const struct array_slice & elts, int 
index, unsigned int unit_offset)
  {
    _DistanceType __half;
    _DistanceType __len;
    const struct ipa_argagg_value * __first;
    const struct ipa_argagg_value * res;
    short unsigned int _1;
    short unsigned int _2;
    int _3;
    unsigned int _4;
    const struct ipa_argagg_value * _14;
    unsigned int _15;
    long unsigned int _16;
    long unsigned int _17;
    const struct ipa_argagg_value * _18;
    long int _20;
    long unsigned int __n.58_39;
    signed long _53;
    long unsigned int _56;
    const struct ipa_argagg_value * _57;
    short unsigned int _58;
    long int _60;
    unsigned int _62;

    <bb 2> [local count: 1073741823]:
    _1 = (short unsigned int) index_6(D);
    _14 = elts_11(D)->m_base;
    _15 = elts_11(D)->m_size;
    _16 = (long unsigned int) _15;
    _17 = _16 * 16;
    _18 = _14 + _17;
    _53 = (signed long) _17;
    _20 = _53 /[ex] 16;
    if (_53 != 0)
      goto <bb 3>; [89.00%]
    else
      goto <bb 8>; [11.00%]

    <bb 3> [local count: 8687547686]:
    # __len_32 = PHI <_20(2), __len_64(7)>
    # __first_35 = PHI <_14(2), __first_63(7)>
    __half_38 = __len_32 >> 1;
    __n.58_39 = (long unsigned int) __half_38;
    _56 = __n.58_39 * 16;
    _57 = __first_35 + _56;
    _58 = _57->index;
    if (_1 > _58)
      goto <bb 6>; [34.00%]
    else
      goto <bb 4>; [66.00%]

    <bb 4> [local count: 5733781450]:
    if (_1 < _58)
      goto <bb 7>; [34.00%]
    else
      goto <bb 5>; [66.00%]

    <bb 5> [local count: 3784295727]:
    _62 = _57->unit_offset;
    if (unit_offset_9(D) > _62)
      goto <bb 6>; [34.00%]
    else
      goto <bb 7>; [66.00%]

    <bb 6> [local count: 4240426809]:
    __first_59 = _57 + 16;
    _60 = __len_32 - __half_38;
    __len_61 = _60 + -1;

    <bb 7> [local count: 8687547695]:
    # __first_63 = PHI <__first_59(6), __first_35(5), __first_35(4)>
    # __len_64 = PHI <__len_61(6), __half_38(5), __half_38(4)>
    if (__len_64 > 0)
      goto <bb 3>; [89.00%]
    else
      goto <bb 8>; [11.00%]

    <bb 8> [local count: 1073741841]:
    # __first_51 = PHI <__first_63(7), _14(2)>
    if (_18 == __first_51)
      goto <bb 12>; [14.90%]
    else
      goto <bb 9>; [85.10%]

    <bb 9> [local count: 913754310]:
    _2 = __first_51->index;
    _3 = (int) _2;
    if (_3 != index_6(D))
      goto <bb 12>; [44.22%]
    else
      goto <bb 10>; [55.78%]

    <bb 10> [local count: 509692156]:
    _4 = __first_51->unit_offset;
    if (_4 != unit_offset_9(D))
      goto <bb 11>; [44.22%]
    else
      goto <bb 12>; [55.78%]

    <bb 11> [local count: 225385870]:

    <bb 12> [local count: 1073741841]:
    # res_5 = PHI <__first_51(10), 0B(8), 0B(9), 0B(11)>
    return res_5;

  }
  ----------------------------------------------------------------------

And the following assembly:

  ----------------------------------------------------------------------

        .p2align 4
        .globl  _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij
        .type   _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij, @function
  _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij:
  .LFB4007:
        .cfi_startproc
        movl    8(%rdi), %eax
        movq    (%rdi), %r8
        movl    %esi, %r10d
        movl    %esi, %r9d
        salq    $4, %rax
        movq    %rax, %rcx
        leaq    (%r8,%rax), %r11
        sarq    $4, %rcx
        testq   %rax, %rax
        jne     .L1395
        jmp     .L1392
        .p2align 4,,10
        .p2align 3
  .L1405:
        cmpw    %di, %r9w
        jb      .L1398
        cmpl    %edx, 8(%rax)
        jb      .L1393
  .L1398:
        movq    %rsi, %rcx
        testq   %rcx, %rcx
        jle     .L1392
  .L1395:
        movq    %rcx, %rsi
        sarq    %rsi
        movq    %rsi, %rax
        salq    $4, %rax
        addq    %r8, %rax
        movzwl  12(%rax), %edi
        cmpw    %r9w, %di
        jnb     .L1405
  .L1393:
        subq    %rsi, %rcx
        leaq    16(%rax), %r8
        subq    $1, %rcx
        testq   %rcx, %rcx
        jg      .L1395
  .L1392:
        cmpq    %r8, %r11
        je      .L1400
        movzwl  12(%r8), %eax
        cmpl    %r10d, %eax
        jne     .L1400
        cmpl    %edx, 8(%r8)
        je      .L1391
  .L1400:
        xorl    %r8d, %r8d
  .L1391:
        movq    %r8, %rax
        ret
        .cfi_endproc
  .LFE4007:
        .size   _Z7testfunRK11array_sliceIK16ipa_argagg_valueEij, 
.-_Z7testfunRK11array_sliceIK16ipa_argagg_valueEij
  ----------------------------------------------------------------------

Martin

Reply via email to