On 3/14/20 4:12 PM, LIU Zhiwei wrote: > I am not sure whether I get it. In my opinion, the code should be modified > like > > static inline int8_t aadd8_rnu(CPURISCVState *env, int8_t a, int8_t b) > { > int16_t res = (int16_t)a + (int16_t)b; > uint8_t round = res & 0x1; > res = (res >> 1) + round; > return res; > } > > static inline int8_t aadd8_rne(CPURISCVState *env, int8_t a, int8_t b) > { > int16_t res = (int16_t)a + (int16_t)b; > uint8_t round = ((res & 0x3) == 0x3); > res = (res >> 1) + round; > return res; > } > > static inline int8_t aadd8_rdn(CPURISCVState *env, int8_t a, int8_t b) > { > int16_t res = (int16_t)a + (int16_t)b; > res = (res >> 1); > return res; > } > > static inline int8_t aadd8_rod(CPURISCVState *env, int8_t a, int8_t b) > { > int16_t res = (int16_t)a + (int16_t)b; > uint8_t round = ((res & 0x3) == 0x1); > res = (res >> 1) + round; > return res; > } > > RVVCALL(OPIVV2_ENV, vaadd_vv_b_rnu, OP_SSS_B, H1, H1, H1, aadd8_rnu) > RVVCALL(OPIVV2_ENV, vaadd_vv_b_rne, OP_SSS_B, H1, H1, H1, aadd8_rne) > RVVCALL(OPIVV2_ENV, vaadd_vv_b_rdn, OP_SSS_B, H1, H1, H1, aadd8_rdn) > RVVCALL(OPIVV2_ENV, vaadd_vv_b_rod, OP_SSS_B, H1, H1, H1, aadd8_rod) > > void do_vext_vv_env(void *vd, void *v0, void *vs1, > void *vs2, CPURISCVState *env, uint32_t desc, > uint32_t esz, uint32_t dsz, > opivv2_fn *fn, clear_fn *clearfn) > { > uint32_t vlmax = vext_maxsz(desc) / esz; > uint32_t mlen = vext_mlen(desc); > uint32_t vm = vext_vm(desc); > uint32_t vl = env->vl; > uint32_t i; > for (i = 0; i < vl; i++) { > if (!vm && !vext_elem_mask(v0, mlen, i)) { > continue; > } > fn(vd, vs1, vs2, i, env); > } > if (i != 0) { > clear_fn(vd, vl, vl * dsz, vlmax * dsz); > } > } > > #define GEN_VEXT_VV_ENV(NAME, ESZ, DSZ, CLEAR_FN) \ > void HELPER(NAME)(void *vd, void *v0, void *vs1, \ > void *vs2, CPURISCVState *env, \ > uint32_t desc) \ > { \ > static opivv2_fn *fns[4] = { \ > NAME##_rnu, NAME##_rne, \ > NAME##_rdn, NAME##_rod \ > } \ > return do_vext_vv_env(vd, v0, vs1, vs2, env, desc, 
\ > ESZ, DSZ, fns[env->vxrm], \ > CLEAR_FN); \ > } > > Is it true?
While that does look good for this case, there are many other uses of get_round(), and it may not be quite as simple there. My suggestion was static inline int32_t aadd32(int vxrm, int32_t a, int32_t b) { int64_t res = (int64_t)a + b; uint8_t round = get_round(vxrm, res, 1); return (res >> 1) + round; } static inline int64_t aadd64(int vxrm, int64_t a, int64_t b) { int64_t res = a + b; uint8_t round = get_round(vxrm, res, 1); int64_t over = (res ^ a) & (res ^ b) & INT64_MIN; /* With signed overflow, bit 64 is inverse of bit 63. */ return ((res >> 1) ^ over) + round; } RVVCALL(OPIVV2_RM, vaadd_vv_b, OP_SSS_B, H1, H1, H1, aadd32) RVVCALL(OPIVV2_RM, vaadd_vv_h, OP_SSS_H, H2, H2, H2, aadd32) RVVCALL(OPIVV2_RM, vaadd_vv_w, OP_SSS_W, H4, H4, H4, aadd32) RVVCALL(OPIVV2_RM, vaadd_vv_d, OP_SSS_D, H8, H8, H8, aadd64) static inline void vext_vv_rm_1(void *vd, void *v0, void *vs1, void *vs2, uint32_t vl, uint32_t vm, uint32_t mlen, int vxrm, opivv2_rm_fn *fn) { for (uint32_t i = 0; i < vl; i++) { if (!vm && !vext_elem_mask(v0, mlen, i)) { continue; } fn(vd, vs1, vs2, i, vxrm); } } static inline void vext_vv_rm_2(void *vd, void *v0, void *vs1, void *vs2, CPURISCVState *env, uint32_t desc, uint32_t esz, uint32_t dsz, opivv2_rm_fn *fn, clear_fn *clearfn) { uint32_t vlmax = vext_maxsz(desc) / esz; uint32_t mlen = vext_mlen(desc); uint32_t vm = vext_vm(desc); uint32_t vl = env->vl; if (vl == 0) { return; } switch (env->vxrm) { case 0: /* rnu */ vext_vv_rm_1(vd, v0, vs1, vs2, vl, vm, mlen, 0, fn); break; case 1: /* rne */ vext_vv_rm_1(vd, v0, vs1, vs2, vl, vm, mlen, 1, fn); break; case 2: /* rdn */ vext_vv_rm_1(vd, v0, vs1, vs2, vl, vm, mlen, 2, fn); break; default: /* rod */ vext_vv_rm_1(vd, v0, vs1, vs2, vl, vm, mlen, 3, fn); break; } clear_fn(vd, vl, vl * dsz, vlmax * dsz); } From vext_vv_rm_2, a constant is passed down all of the inline functions, so that a constant arrives in get_round() at the bottom of the call chain. 
At which point all of the expressions get folded by the compiler and we *should* get very similar generated code as to what you have above. r~