On 3/12/20 7:58 AM, LIU Zhiwei wrote: > +#define GEN_VEXT_VSLIDEUP_VX(NAME, ETYPE, H, CLEAR_FN) \ > +void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ > + CPURISCVState *env, uint32_t desc) \ > +{ \ > + uint32_t mlen = vext_mlen(desc); \ > + uint32_t vlmax = env_archcpu(env)->cfg.vlen / mlen; \ > + uint32_t vm = vext_vm(desc); \ > + uint32_t vl = env->vl; \ > + uint32_t offset = s1, i; \ > + \ > + if (offset > vl) { \ > + offset = vl; \ > + } \
This isn't right. > + for (i = 0; i < vl; i++) { \ > + if (((i < offset)) || (!vm && !vext_elem_mask(v0, mlen, i))) { \ > + continue; \ > + } \ > + *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - offset)); \ > + } \ > + if (i == 0) { \ > + return; \ > + } \ You need to eliminate vl == 0 first, not last. Then for (i = offset; i < vl; i++) The types of i and vl need to be extended to target_ulong, so that you don't incorrectly crop the input offset. It may be worth special-casing vm=1, or hoisting it out of the loop. The operation becomes a memcpy (at least for little-endian) at that point. See swap_memmove in arm/sve_helper.c. > +#define GEN_VEXT_VSLIDEDOWN_VX(NAME, ETYPE, H, CLEAR_FN) \ > +void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ > + CPURISCVState *env, uint32_t desc) \ > +{ \ > + uint32_t mlen = vext_mlen(desc); \ > + uint32_t vlmax = env_archcpu(env)->cfg.vlen / mlen; \ > + uint32_t vm = vext_vm(desc); \ > + uint32_t vl = env->vl; \ > + uint32_t offset = s1, i; \ > + \ > + for (i = 0; i < vl; i++) { \ > + if (!vm && !vext_elem_mask(v0, mlen, i)) { \ > + continue; \ > + } \ > + if (i + offset < vlmax) { \ > + *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i + offset)); \ Again, eliminate vl == 0 first. In fact, why don't we make that a global request for all of the patches for the next revision. Checking for i == 0 last is silly, and checks for the zero twice: once in the loop bounds and again at the end. It is probably worth changing the loop bounds to if (offset >= vlmax) { max = 0; } else { max = MIN(vl, vlmax - offset); } for (i = 0; i < max; ++i) > + } else { \ > + *((ETYPE *)vd + H(i)) = 0; \ > + } Which lets these zeros merge into... > + for (; i < vlmax; i++) { \ > + CLEAR_FN(vd, vl, vl * sizeof(ETYPE), vlmax * sizeof(ETYPE)); \ > + } \ These zeros. > +#define GEN_VEXT_VSLIDE1UP_VX(NAME, ETYPE, H, CLEAR_FN) \ > +void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ > + CPURISCVState *env, uint32_t desc) \ > +{ \ > + uint32_t mlen = vext_mlen(desc); \ > + uint32_t vlmax = env_archcpu(env)->cfg.vlen / mlen; \ > + uint32_t vm = vext_vm(desc); \ > + uint32_t vl = env->vl; \ > + uint32_t i; \ > + \ > + for (i = 0; i < vl; i++) { \ > + if (!vm && !vext_elem_mask(v0, mlen, i)) { \ > + continue; \ > + } \ > + if (i == 0) { \ > + *((ETYPE *)vd + H(i)) = s1; \ > + } else { \ > + *((ETYPE *)vd + H(i)) = *((ETYPE *)vs2 + H(i - 1)); \ > + } \ > + } \ > + if (i == 0) { \ > + return; \ > + } \ > + for (; i < vlmax; i++) { \ > + CLEAR_FN(vd, vl, vl * sizeof(ETYPE), vlmax * sizeof(ETYPE)); \ > + } \ > +} As a preference, I think you can do away with this helper. Simply use the slideup helper with argument 1, and then afterwards store the integer register into element 0. You should be able to re-use code from vmv.s.x for that. > +#define GEN_VEXT_VSLIDE1DOWN_VX(NAME, ETYPE, H, CLEAR_FN) \ > +void HELPER(NAME)(void *vd, void *v0, target_ulong s1, void *vs2, \ > + CPURISCVState *env, uint32_t desc) \ > +{ \ Likewise. r~