From: Michel Dänzer <daen...@vmware.com>

Also add fast paths for untransformed Composite operations.

This can significantly reduce the CPU overhead in RadeonCompositeTileCP, at
least for TCL-capable GPUs.
---

I think the basic idea is sound, but I'm not sure whether some parts go too
far, e.g. the float fw, fh locals in the fast path. Opinions?


 src/r600_exa.c           |    2 -
 src/radeon.h             |    6 +-
 src/radeon_commonfuncs.c |    4 +-
 src/radeon_exa_render.c  |  242 +++++++++++++++++++++++++---------------------
 src/radeon_render.c      |   12 +-
 5 files changed, 141 insertions(+), 125 deletions(-)

diff --git a/src/r600_exa.c b/src/r600_exa.c
index f6f2007..a794598 100644
--- a/src/r600_exa.c
+++ b/src/r600_exa.c
@@ -1114,8 +1114,6 @@ R600DoneCopy(PixmapPtr pDst)
 }
 
 
-#define xFixedToFloat(f) (((float) (f)) / 65536)
-
 struct blendinfo {
     Bool dst_alpha;
     Bool src_alpha;
diff --git a/src/radeon.h b/src/radeon.h
index 9d283bb..2fa4714 100644
--- a/src/radeon.h
+++ b/src/radeon.h
@@ -236,7 +236,7 @@ typedef enum {
                                   * for something else.
                                   */
 
-#define xFixedToFloat(f) (((float) (f)) / 65536)
+#define xFixedToFloat(f) (((float) (f)) * (1.0f / 65536.0f))
 
 #define RADEON_LOGLEVEL_DEBUG 4
 
@@ -657,8 +657,8 @@ struct radeon_accel_state {
     uint32_t          dst_pitch_offset;
 
     /* render accel */
-    unsigned short    texW[2];
-    unsigned short    texH[2];
+    float             texWrcp[2];
+    float             texHrcp[2];
     Bool              XInited3D; /* X itself has the 3D context */
     int               num_gb_pipes;
     Bool              has_tcl;
diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c
index 8c46235..61a5b75 100644
--- a/src/radeon_commonfuncs.c
+++ b/src/radeon_commonfuncs.c
@@ -59,8 +59,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn)
     int size;
     ACCEL_PREAMBLE();
 
-    info->accel_state->texW[0] = info->accel_state->texH[0] =
-       info->accel_state->texW[1] = info->accel_state->texH[1] = 1;
+    info->accel_state->texWrcp[0] = info->accel_state->texHrcp[0] =
+       info->accel_state->texWrcp[1] = info->accel_state->texHrcp[1] = 1.0f;
 
     if (IS_R300_3D || IS_R500_3D) {
 
diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c
index 7bc8ef0..116f00d 100644
--- a/src/radeon_exa_render.c
+++ b/src/radeon_exa_render.c
@@ -409,8 +409,8 @@ static Bool FUNC_NAME(R100TextureSetup)(PicturePtr pPict, 
PixmapPtr pPix,
        txformat |= RADEON_TXFORMAT_NON_POWER2;
     txformat |= unit << 24; /* RADEON_TXFORMAT_ST_ROUTE_STQX */
 
-    info->accel_state->texW[unit] = w;
-    info->accel_state->texH[unit] = h;
+    info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+    info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
 
     switch (pPict->filter) {
     case PictFilterNearest:
@@ -794,8 +794,8 @@ static Bool FUNC_NAME(R200TextureSetup)(PicturePtr pPict, 
PixmapPtr pPix,
        txformat |= R200_TXFORMAT_NON_POWER2;
     txformat |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT;
 
-    info->accel_state->texW[unit] = w;
-    info->accel_state->texH[unit] = h;
+    info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+    info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
 
     switch (pPict->filter) {
     case PictFilterNearest:
@@ -1244,64 +1244,44 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr 
pPict, PixmapPtr pPix,
        OUT_ACCEL_REG(R300_TX_BORDER_COLOR_0 + (unit * 4), 0);
     FINISH_ACCEL();
 
-    if (pPict->transform != 0) {
-       info->accel_state->is_transform[unit] = TRUE;
-       info->accel_state->transform[unit] = pPict->transform;
+    if (info->accel_state->has_tcl) {
+       info->accel_state->is_transform[unit] = FALSE;
 
        /* setup the PVS consts */
-       if (info->accel_state->has_tcl) {
-           info->accel_state->texW[unit] = 1;
-           info->accel_state->texH[unit] = 1;
-           BEGIN_ACCEL(9);
-           if (IS_R300_3D)
-               OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 
R300_PVS_VECTOR_CONST_INDEX(unit * 2));
-           else
-               OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 
R500_PVS_VECTOR_CONST_INDEX(unit * 2));
+       BEGIN_ACCEL(9);
+       if (IS_R300_3D)
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 
R300_PVS_VECTOR_CONST_INDEX(unit * 2));
+       else
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 
R500_PVS_VECTOR_CONST_INDEX(unit * 2));
 
+       if (pPict->transform) {
            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, 
F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][0])));
            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, 
F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][1])));
            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, 
F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][2])));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
 
            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, 
F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][0])));
            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, 
F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][1])));
            OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, 
F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][2])));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
-
-           FINISH_ACCEL();
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
        } else {
-           info->accel_state->texW[unit] = w;
-           info->accel_state->texH[unit] = h;
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w));
+
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f));
+           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h));
        }
-    } else {
-       info->accel_state->is_transform[unit] = FALSE;
-
-       /* setup the PVS consts */
-       if (info->accel_state->has_tcl) {
-           info->accel_state->texW[unit] = 1;
-           info->accel_state->texH[unit] = 1;
 
-           BEGIN_ACCEL(9);
-           if (IS_R300_3D)
-               OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 
R300_PVS_VECTOR_CONST_INDEX(unit * 2));
-           else
-               OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, 
R500_PVS_VECTOR_CONST_INDEX(unit * 2));
-
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w));
-
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0));
-           OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h));
-
-           FINISH_ACCEL();
-       } else {
-           info->accel_state->texW[unit] = w;
-           info->accel_state->texH[unit] = h;
-       }
+       FINISH_ACCEL();
+    } else {
+       info->accel_state->is_transform[unit] = !!pPict->transform;
+       info->accel_state->transform[unit] = pPict->transform;
+       info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w);
+       info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h);
     }
 
     return TRUE;
@@ -2147,8 +2127,6 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr 
pScrn,
                                           int w, int h)
 {
     int vtx_count;
-    xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
-    static xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, 
maskBottomRight;
     ACCEL_PREAMBLE();
 
     ENTER_DRAW(0);
@@ -2172,45 +2150,9 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr 
pScrn,
     }
 #endif
 
-    srcTopLeft.x     = IntToxFixed(srcX);
-    srcTopLeft.y     = IntToxFixed(srcY);
-    srcTopRight.x    = IntToxFixed(srcX + w);
-    srcTopRight.y    = IntToxFixed(srcY);
-    srcBottomLeft.x  = IntToxFixed(srcX);
-    srcBottomLeft.y  = IntToxFixed(srcY + h);
-    srcBottomRight.x = IntToxFixed(srcX + w);
-    srcBottomRight.y = IntToxFixed(srcY + h);
-
-    if (info->accel_state->is_transform[0]) {
-       if ((info->ChipFamily < CHIP_FAMILY_R300) || 
!info->accel_state->has_tcl) {
-           transformPoint(info->accel_state->transform[0], &srcTopLeft);
-           transformPoint(info->accel_state->transform[0], &srcTopRight);
-           transformPoint(info->accel_state->transform[0], &srcBottomLeft);
-           transformPoint(info->accel_state->transform[0], &srcBottomRight);
-       }
-    }
-
-    if (info->accel_state->msk_pic) {
-       maskTopLeft.x     = IntToxFixed(maskX);
-       maskTopLeft.y     = IntToxFixed(maskY);
-       maskTopRight.x    = IntToxFixed(maskX + w);
-       maskTopRight.y    = IntToxFixed(maskY);
-       maskBottomLeft.x  = IntToxFixed(maskX);
-       maskBottomLeft.y  = IntToxFixed(maskY + h);
-       maskBottomRight.x = IntToxFixed(maskX + w);
-       maskBottomRight.y = IntToxFixed(maskY + h);
-
-       if (info->accel_state->is_transform[1]) {
-           if ((info->ChipFamily < CHIP_FAMILY_R300) || 
!info->accel_state->has_tcl) {
-               transformPoint(info->accel_state->transform[1], &maskTopLeft);
-               transformPoint(info->accel_state->transform[1], &maskTopRight);
-               transformPoint(info->accel_state->transform[1], 
&maskBottomLeft);
-               transformPoint(info->accel_state->transform[1], 
&maskBottomRight);
-           }
-       }
-
+    if (info->accel_state->msk_pic)
        vtx_count = 6;
-    } else
+    else
        vtx_count = 4;
 
     if (info->accel_state->vsync)
@@ -2285,32 +2227,108 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr 
pScrn,
 
 #endif
 
-    if (info->accel_state->msk_pic) {
-       if (IS_R300_3D || IS_R500_3D) {
-           VTX_OUT_MASK((float)dstX,                                      
(float)dstY,
-                        xFixedToFloat(srcTopLeft.x) / 
info->accel_state->texW[0],      xFixedToFloat(srcTopLeft.y) / 
info->accel_state->texH[0],
-                        xFixedToFloat(maskTopLeft.x) / 
info->accel_state->texW[1],     xFixedToFloat(maskTopLeft.y) / 
info->accel_state->texH[1]);
+    if ((info->ChipFamily >= CHIP_FAMILY_R300) && info->accel_state->has_tcl) {
+       float dstX1, dstY1, dstX2, dstY2;
+       float srcX1, srcY1, srcX2, srcY2;
+       float fw, fh;
+
+       fw = w;
+       fh = h;
+       dstX1 = dstX;
+       dstY1 = dstY;
+       dstX2 = dstX1 + fw;
+       dstY2 = dstY1 + fh;
+       srcX1 = srcX;
+       srcY1 = srcY;
+       srcX2 = srcX1 + fw;
+       srcY2 = srcY1 + fh;
+
+       if (info->accel_state->msk_pic) {
+           float maskX1, maskY1, maskX2, maskY2;
+
+           maskX1 = maskX;
+           maskY1 = maskY;
+           maskX2 = maskX1 + fw;
+           maskY2 = maskY1 + fh;
+
+           VTX_OUT_MASK(dstX1,  dstY1,  srcX1,  srcY1,  maskX1,  maskY1);
+           VTX_OUT_MASK(dstX1,  dstY2,  srcX1,  srcY2,  maskX1,  maskY2);
+           VTX_OUT_MASK(dstX2,  dstY2,  srcX2,  srcY2,  maskX2,  maskY2);
+           VTX_OUT_MASK(dstX2,  dstY1,  srcX2,  srcY1,  maskX2,  maskY1);
+       } else {
+           VTX_OUT(dstX1,  dstY1,  srcX1,  srcY1);
+           VTX_OUT(dstX1,  dstY2,  srcX1,  srcY2);
+           VTX_OUT(dstX2,  dstY2,  srcX2,  srcY2);
+           VTX_OUT(dstX2,  dstY1,  srcX2,  srcY1);
        }
-       VTX_OUT_MASK((float)dstX,                                      
(float)(dstY + h),
-               xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0],   
xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0],
-               xFixedToFloat(maskBottomLeft.x) / info->accel_state->texW[1],  
xFixedToFloat(maskBottomLeft.y) / info->accel_state->texH[1]);
-       VTX_OUT_MASK((float)(dstX + w),                                
(float)(dstY + h),
-               xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0],  
xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0],
-               xFixedToFloat(maskBottomRight.x) / info->accel_state->texW[1], 
xFixedToFloat(maskBottomRight.y) / info->accel_state->texH[1]);
-       VTX_OUT_MASK((float)(dstX + w),                                
(float)dstY,
-               xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0],     
xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0],
-               xFixedToFloat(maskTopRight.x) / info->accel_state->texW[1],    
xFixedToFloat(maskTopRight.y) / info->accel_state->texH[1]);
     } else {
-       if (IS_R300_3D || IS_R500_3D) {
-           VTX_OUT((float)dstX,                                      
(float)dstY,
-                   xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0],   
   xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0]);
+       xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight;
+       float srcWrcp = info->accel_state->texWrcp[0];
+       float srcHrcp = info->accel_state->texHrcp[0];
+
+       srcTopLeft.x     = IntToxFixed(srcX);
+       srcTopLeft.y     = IntToxFixed(srcY);
+       srcTopRight.x    = IntToxFixed(srcX + w);
+       srcTopRight.y    = IntToxFixed(srcY);
+       srcBottomLeft.x  = IntToxFixed(srcX);
+       srcBottomLeft.y  = IntToxFixed(srcY + h);
+       srcBottomRight.x = IntToxFixed(srcX + w);
+       srcBottomRight.y = IntToxFixed(srcY + h);
+
+       if (info->accel_state->is_transform[0]) {
+           transformPoint(info->accel_state->transform[0], &srcTopLeft);
+           transformPoint(info->accel_state->transform[0], &srcTopRight);
+           transformPoint(info->accel_state->transform[0], &srcBottomLeft);
+           transformPoint(info->accel_state->transform[0], &srcBottomRight);
+       }
+
+       if (info->accel_state->msk_pic) {
+           xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, 
maskBottomRight;
+           float maskWrcp = info->accel_state->texWrcp[1];
+           float maskHrcp = info->accel_state->texHrcp[1];
+
+           maskTopLeft.x     = IntToxFixed(maskX);
+           maskTopLeft.y     = IntToxFixed(maskY);
+           maskTopRight.x    = IntToxFixed(maskX + w);
+           maskTopRight.y    = IntToxFixed(maskY);
+           maskBottomLeft.x  = IntToxFixed(maskX);
+           maskBottomLeft.y  = IntToxFixed(maskY + h);
+           maskBottomRight.x = IntToxFixed(maskX + w);
+           maskBottomRight.y = IntToxFixed(maskY + h);
+
+           if (info->accel_state->is_transform[1]) {
+               transformPoint(info->accel_state->transform[1], &maskTopLeft);
+               transformPoint(info->accel_state->transform[1], &maskTopRight);
+               transformPoint(info->accel_state->transform[1], 
&maskBottomLeft);
+               transformPoint(info->accel_state->transform[1], 
&maskBottomRight);
+           }
+
+           if (IS_R300_3D || IS_R500_3D) {
+               VTX_OUT_MASK((float)dstX,              (float)dstY,
+                            srcTopLeft.x * srcWrcp,   srcTopLeft.y * srcHrcp,
+                            maskTopLeft.x * maskWrcp, maskTopLeft.y * 
maskHrcp);
+           }
+           VTX_OUT_MASK((float)dstX,                  (float)(dstY + h),
+                        srcBottomLeft.x * srcWrcp,    srcBottomLeft.y * 
srcHrcp,
+                        maskBottomLeft.x * maskWrcp,  maskBottomLeft.y * 
maskHrcp);
+           VTX_OUT_MASK((float)(dstX + w),            (float)(dstY + h),
+                        srcBottomRight.x * srcWrcp,   srcBottomRight.y * 
srcHrcp,
+                        maskBottomRight.x * maskWrcp, maskBottomRight.y * 
maskHrcp);
+           VTX_OUT_MASK((float)(dstX + w),            (float)dstY,
+                        srcTopRight.x * srcWrcp,      srcTopRight.y * srcHrcp,
+                        maskTopRight.x * maskWrcp,    maskTopRight.y * 
maskHrcp);
+       } else {
+           if (IS_R300_3D || IS_R500_3D) {
+               VTX_OUT((float)dstX,            (float)dstY,
+                       srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp);
+           }
+           VTX_OUT((float)dstX,                (float)(dstY + h),
+                   srcBottomLeft.x * srcWrcp,  srcBottomLeft.y * srcHrcp);
+           VTX_OUT((float)(dstX + w),          (float)(dstY + h),
+                   srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp);
+           VTX_OUT((float)(dstX + w),          (float)dstY,
+                   srcTopRight.x * srcWrcp,    srcTopRight.y * srcHrcp);
        }
-       VTX_OUT((float)dstX,                                      (float)(dstY 
+ h),
-               xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0],   
xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0]);
-       VTX_OUT((float)(dstX + w),                                (float)(dstY 
+ h),
-               xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0],  
xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0]);
-       VTX_OUT((float)(dstX + w),                                (float)dstY,
-               xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0],     
xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0]);
     }
 
 #ifdef ACCEL_CP
diff --git a/src/radeon_render.c b/src/radeon_render.c
index 6668fe0..68811b7 100644
--- a/src/radeon_render.c
+++ b/src/radeon_render.c
@@ -773,8 +773,8 @@ static Bool FUNC_NAME(R200SetupTexture)(
        txformat |= RADEON_TXFORMAT_NON_POWER2;
     }
 
-    info->accel_state->texW[0] = width;
-    info->accel_state->texH[0] = height;
+    info->accel_state->texWrcp[0] = 1.0f / width;
+    info->accel_state->texHrcp[0] = 1.0f / height;
 
     offset = info->accel_state->RenderTex->offset * pScrn->bitsPerPixel / 8;
     dst = (uint8_t*)(info->FB + offset);
@@ -975,10 +975,10 @@ FUNC_NAME(R200SubsequentCPUToScreenTexture) (
     
     r = width + l;
     b = height + t;
-    fl = (float)srcx / info->accel_state->texW[0];
-    fr = (float)(srcx + width) / info->accel_state->texW[0];
-    ft = (float)srcy / info->accel_state->texH[0];
-    fb = (float)(srcy + height) / info->accel_state->texH[0];
+    fl = (float)srcx * info->accel_state->texWrcp[0];
+    fr = (float)(srcx + width) * info->accel_state->texWrcp[0];
+    ft = (float)srcy * info->accel_state->texHrcp[0];
+    fb = (float)(srcy + height) * info->accel_state->texHrcp[0];
 
 #ifdef ACCEL_CP
     BEGIN_RING(24);
-- 
1.6.4.3

_______________________________________________
xorg-driver-ati mailing list
xorg-driver-ati@lists.x.org
http://lists.x.org/mailman/listinfo/xorg-driver-ati

Reply via email to