On 5/17/19 9:47 AM, Richard Henderson wrote:
>     first_equal = n;
>     first_zero = n;
>     for (i = n - 1; i >= 0; --i) {
>         if (data1 == data2) {
>             first_equal = i;
>         }
>         if (data1 == 0) {
>             first_zero = i;
>         }
>     }
>
> // As an aside, there are bit tricks for the above,
> // but let's stay simple(r) for now.

What the hell, it's not /that/ tricky.

/*
 * Returns a bit set in the MSB of each element that is zero,
 * as defined by the mask M.
 */
static inline uint64_t zero_search(uint64_t a, uint64_t m)
{
    return ~(((a & m) + m) | a | m);
}

/*
 * Returns the byte offset for the first match, or 16 for no match.
 */
static inline int match_index(uint64_t c0, uint64_t c1)
{
    return (c0 ? clz64(c0) : clz64(c1) + 64) >> 3;
}

Use

    dup_const(MO_8, 0x7f)
    dup_const(MO_16, 0x7fff)
    dup_const(MO_32, 0x7fffffff)

for the M parameter for the different element sizes.

    uint64_t a0, a1, b0, b1, e0, e1, z0, z1;

    a0 = s390_vec_read_element64(v2, 0);
    a1 = s390_vec_read_element64(v2, 1);
    b0 = s390_vec_read_element64(v3, 0);
    b1 = s390_vec_read_element64(v3, 1);
    e0 = zero_search(a0 ^ b0, m);
    e1 = zero_search(a1 ^ b1, m);
    first_equal = match_index(e0, e1);

    if (zs) {
        z0 = zero_search(a0, m);
        z1 = zero_search(a1, m);
        first_zero = match_index(z0, z1);
    ...


r~