From: Maxim Maslov <mas...@eltechs.com>

---
 src/gallium/drivers/vc4/vc4_tiling_lt.c | 93 +++++++++++++++++++++++++++++++--
 1 file changed, 90 insertions(+), 3 deletions(-)

diff --git a/src/gallium/drivers/vc4/vc4_tiling_lt.c b/src/gallium/drivers/vc4/vc4_tiling_lt.c
index c9cbc65..d291262 100644
--- a/src/gallium/drivers/vc4/vc4_tiling_lt.c
+++ b/src/gallium/drivers/vc4/vc4_tiling_lt.c
@@ -105,6 +105,49 @@ vc4_load_utile(void *cpu, void *gpu, uint32_t cpu_stride, uint32_t cpp)
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
+#elif defined(USE_SSE_ASM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "movdqu 0x10(%1), %%xmm1;"
+                        "movdqu 0x20(%1), %%xmm2;"
+                        "movdqu 0x30(%1), %%xmm3;"
+                        "movlpd %%xmm0, 0(%0);"
+                        "mov %2, %%ecx;"
+                        "movhpd %%xmm0, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movlpd %%xmm3, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movhpd %%xmm3, 0(%0,%%ecx,1);"
+                        :
+                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
+                        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%ecx");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "movdqu 0x10(%1), %%xmm1;"
+                        "movdqu 0x20(%1), %%xmm2;"
+                        "movdqu 0x30(%1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "mov %2, %%ecx;"
+                        "movdqu %%xmm1, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movdqu %%xmm2, 0(%0,%%ecx,1);"
+                        "add %2, %%ecx;"
+                        "movdqu %%xmm3, 0(%0,%%ecx,1);"
+                        :
+                        : "r"(cpu), "r"(gpu), "r"(cpu_stride)
+                        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%ecx");
+        }
 #else
         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(cpu, gpu + gpu_offset, gpu_stride);
@@ -160,13 +203,55 @@ vc4_store_utile(void *gpu, void *cpu, uint32_t cpu_stride, uint32_t cpp)
                         : "r"(gpu), "r"(cpu), "r"(cpu + 8), "r"(cpu_stride)
                         : "q0", "q1", "q2", "q3");
         }
+#elif defined(USE_SSE_ASM)
+        if (gpu_stride == 8) {
+                __asm__ volatile (
+                        "movlpd 0(%1), %%xmm0;"
+                        "mov %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm0;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movlpd 0(%1,%%ecx,1), %%xmm3;"
+                        "add %2, %%ecx;"
+                        "movhpd 0(%1,%%ecx,1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "movdqu %%xmm1, 0x10(%0);"
+                        "movdqu %%xmm2, 0x20(%0);"
+                        "movdqu %%xmm3, 0x30(%0);"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%ecx");
+        } else {
+                assert(gpu_stride == 16);
+                __asm__ volatile (
+                        "movdqu 0(%1), %%xmm0;"
+                        "mov %2, %%ecx;"
+                        "movdqu 0(%1,%%ecx,1), %%xmm1;"
+                        "add %2, %%ecx;"
+                        "movdqu 0(%1,%%ecx,1), %%xmm2;"
+                        "add %2, %%ecx;"
+                        "movdqu 0(%1,%%ecx,1), %%xmm3;"
+                        "movdqu %%xmm0, 0(%0);"
+                        "movdqu %%xmm1, 0x10(%0);"
+                        "movdqu %%xmm2, 0x20(%0);"
+                        "movdqu %%xmm3, 0x30(%0);"
+                        :
+                        : "r"(gpu), "r"(cpu), "r"(cpu_stride)
+                        : "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%ecx");
+        }
 #else
         for (uint32_t gpu_offset = 0; gpu_offset < 64; gpu_offset += gpu_stride) {
                 memcpy(gpu + gpu_offset, cpu, gpu_stride);
                 cpu += cpu_stride;
         }
 #endif
-
 }
 
 void
@@ -175,6 +260,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                             int cpp, const struct pipe_box *box)
 {
         uint32_t utile_w = vc4_utile_width(cpp);
+        uint32_t xfactor = 64 / utile_w;
         uint32_t utile_h = vc4_utile_height(cpp);
         uint32_t xstart = box->x;
         uint32_t ystart = box->y;
@@ -184,7 +270,7 @@ NEON_TAG(vc4_load_lt_image)(void *dst, uint32_t dst_stride,
                         vc4_load_utile(dst + (dst_stride * y +
                                               x * cpp),
                                        src + ((ystart + y) * src_stride +
-                                              (xstart + x) * 64 / utile_w),
+                                              (xstart + x) * xfactor),
                                        dst_stride, cpp);
                 }
         }
@@ -196,6 +282,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
                              int cpp, const struct pipe_box *box)
 {
         uint32_t utile_w = vc4_utile_width(cpp);
+        uint32_t xfactor = 64 / utile_w;
         uint32_t utile_h = vc4_utile_height(cpp);
         uint32_t xstart = box->x;
         uint32_t ystart = box->y;
@@ -203,7 +290,7 @@ NEON_TAG(vc4_store_lt_image)(void *dst, uint32_t dst_stride,
         for (uint32_t y = 0; y < box->height; y += utile_h) {
                 for (int x = 0; x < box->width; x += utile_w) {
                         vc4_store_utile(dst + ((ystart + y) * dst_stride +
-                                               (xstart + x) * 64 / utile_w),
+                                               (xstart + x) * xfactor),
                                         src + (src_stride * y +
                                                x * cpp),
                                         src_stride, cpp);
-- 
2.7.4

_______________________________________________
mesa-dev mailing list
mesa-dev@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/mesa-dev