At 2024-12-04 23:39:00, "Micro Daryl Robles" <[email protected]> wrote: >Also add a new helper function transpose_4x8_s16. >+static inline void transpose_4x8_s16(int16x4_t s0, int16x4_t s1, int16x4_t >s2, int16x4_t s3, >+ int16x4_t s4, int16x4_t s5, int16x4_t >s6, int16x4_t s7, >+ int16x8_t &d0, int16x8_t &d1, int16x8_t >&d2, int16x8_t &d3) >+{ >+ int16x8_t s0q = vcombine_s16(s0, vdup_n_s16(0)); >+ int16x8_t s1q = vcombine_s16(s1, vdup_n_s16(0)); >+ int16x8_t s2q = vcombine_s16(s2, vdup_n_s16(0)); >+ int16x8_t s3q = vcombine_s16(s3, vdup_n_s16(0)); >+ int16x8_t s4q = vcombine_s16(s4, vdup_n_s16(0)); >+ int16x8_t s5q = vcombine_s16(s5, vdup_n_s16(0)); >+ int16x8_t s6q = vcombine_s16(s6, vdup_n_s16(0)); >+ int16x8_t s7q = vcombine_s16(s7, vdup_n_s16(0)); Same as my previous comment: it is unnecessary to clear the high 64 bits. >+template<int shift> >+static inline void partialButterflyInverse8_neon(const int16_t *src, int16_t >*dst, intptr_t dstStride) >+ if (vget_lane_u64(vreinterpret_u64_s16(vget_low_s16(s3)), 0) != 0) Detecting zeros is a good idea; however, 4 instructions are not enough to hide the pipeline flush cost. I suggest combining each pair of if-sections below (O_lo & O_hi) into one. >+ { >+ O_lo[0] = vmlal_lane_s16(O_lo[0], vget_low_s16(s3), c_odd, 1); // 75 >+ O_lo[1] = vmlsl_lane_s16(O_lo[1], vget_low_s16(s3), c_odd, 3); // -18 >+ O_lo[2] = vmlsl_lane_s16(O_lo[2], vget_low_s16(s3), c_odd, 0); // -89 >+ O_lo[3] = vmlsl_lane_s16(O_lo[3], vget_low_s16(s3), c_odd, 2); // -50 >+ } >+ if (vget_lane_u64(vreinterpret_u64_s16(vget_high_s16(s3)), 0) != 0) >+ { >+ O_hi[0] = vmlal_lane_s16(O_hi[0], vget_high_s16(s3), c_odd, 1); // 75 >+ O_hi[1] = vmlsl_lane_s16(O_hi[1], vget_high_s16(s3), c_odd, 3); // -18 >+ O_hi[2] = vmlsl_lane_s16(O_hi[2], vget_high_s16(s3), c_odd, 0); // -89 >+ O_hi[3] = vmlsl_lane_s16(O_hi[3], vget_high_s16(s3), c_odd, 2); // -50 >+ }
_______________________________________________ x265-devel mailing list [email protected] https://mailman.videolan.org/listinfo/x265-devel
