From: Michel Dänzer <daen...@vmware.com> Also add fast paths for untransformed Composite operations.
This can significantly reduce the CPU overhead in RadeonCompositeTileCP, at least for TCL capable GPUs. --- I think the basic idea is sound, but I'm not sure if some parts are going too far, e.g. the float fw, fh locals in the fastpath. Opinions? src/r600_exa.c | 2 - src/radeon.h | 6 +- src/radeon_commonfuncs.c | 4 +- src/radeon_exa_render.c | 242 +++++++++++++++++++++++++--------------------- src/radeon_render.c | 12 +- 5 files changed, 141 insertions(+), 125 deletions(-) diff --git a/src/r600_exa.c b/src/r600_exa.c index f6f2007..a794598 100644 --- a/src/r600_exa.c +++ b/src/r600_exa.c @@ -1114,8 +1114,6 @@ R600DoneCopy(PixmapPtr pDst) } -#define xFixedToFloat(f) (((float) (f)) / 65536) - struct blendinfo { Bool dst_alpha; Bool src_alpha; diff --git a/src/radeon.h b/src/radeon.h index 9d283bb..2fa4714 100644 --- a/src/radeon.h +++ b/src/radeon.h @@ -236,7 +236,7 @@ typedef enum { * for something else. */ -#define xFixedToFloat(f) (((float) (f)) / 65536) +#define xFixedToFloat(f) (((float) (f)) * (1.0f / 65536.0f)) #define RADEON_LOGLEVEL_DEBUG 4 @@ -657,8 +657,8 @@ struct radeon_accel_state { uint32_t dst_pitch_offset; /* render accel */ - unsigned short texW[2]; - unsigned short texH[2]; + float texWrcp[2]; + float texHrcp[2]; Bool XInited3D; /* X itself has the 3D context */ int num_gb_pipes; Bool has_tcl; diff --git a/src/radeon_commonfuncs.c b/src/radeon_commonfuncs.c index 8c46235..61a5b75 100644 --- a/src/radeon_commonfuncs.c +++ b/src/radeon_commonfuncs.c @@ -59,8 +59,8 @@ static void FUNC_NAME(RADEONInit3DEngine)(ScrnInfoPtr pScrn) int size; ACCEL_PREAMBLE(); - info->accel_state->texW[0] = info->accel_state->texH[0] = - info->accel_state->texW[1] = info->accel_state->texH[1] = 1; + info->accel_state->texWrcp[0] = info->accel_state->texHrcp[0] = + info->accel_state->texWrcp[1] = info->accel_state->texHrcp[1] = 1.0f; if (IS_R300_3D || IS_R500_3D) { diff --git a/src/radeon_exa_render.c b/src/radeon_exa_render.c index 7bc8ef0..116f00d 100644 --- a/src/radeon_exa_render.c +++ b/src/radeon_exa_render.c @@ -409,8 +409,8 @@ static Bool FUNC_NAME(R100TextureSetup)(PicturePtr pPict, PixmapPtr pPix, txformat |= RADEON_TXFORMAT_NON_POWER2; txformat |= unit << 24; /* RADEON_TXFORMAT_ST_ROUTE_STQX */ - info->accel_state->texW[unit] = w; - info->accel_state->texH[unit] = h; + info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w); + info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h); switch (pPict->filter) { case PictFilterNearest: @@ -794,8 +794,8 @@ static Bool FUNC_NAME(R200TextureSetup)(PicturePtr pPict, PixmapPtr pPix, txformat |= R200_TXFORMAT_NON_POWER2; txformat |= unit << R200_TXFORMAT_ST_ROUTE_SHIFT; - info->accel_state->texW[unit] = w; - info->accel_state->texH[unit] = h; + info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w); + info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h); switch (pPict->filter) { case PictFilterNearest: @@ -1244,64 +1244,44 @@ static Bool FUNC_NAME(R300TextureSetup)(PicturePtr pPict, PixmapPtr pPix, OUT_ACCEL_REG(R300_TX_BORDER_COLOR_0 + (unit * 4), 0); FINISH_ACCEL(); - if (pPict->transform != 0) { - info->accel_state->is_transform[unit] = TRUE; - info->accel_state->transform[unit] = pPict->transform; + if (info->accel_state->has_tcl) { + info->accel_state->is_transform[unit] = FALSE; /* setup the PVS consts */ - if (info->accel_state->has_tcl) { - info->accel_state->texW[unit] = 1; - info->accel_state->texH[unit] = 1; - BEGIN_ACCEL(9); - if (IS_R300_3D) - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2)); - else - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2)); + BEGIN_ACCEL(9); + if (IS_R300_3D) + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2)); + else + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2)); + if (pPict->transform) { OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][0]))); OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][1]))); OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[0][2]))); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w)); OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][0]))); OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][1]))); OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(xFixedToFloat(pPict->transform->matrix[1][2]))); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h)); - - FINISH_ACCEL(); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h)); } else { - info->accel_state->texW[unit] = w; - info->accel_state->texH[unit] = h; + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)w)); + + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0f)); + OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0f/(float)h)); } - } else { - info->accel_state->is_transform[unit] = FALSE; - - /* setup the PVS consts */ - if (info->accel_state->has_tcl) { - info->accel_state->texW[unit] = 1; - info->accel_state->texH[unit] = 1; - BEGIN_ACCEL(9); - if (IS_R300_3D) - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R300_PVS_VECTOR_CONST_INDEX(unit * 2)); - else - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_INDX_REG, R500_PVS_VECTOR_CONST_INDEX(unit * 2)); - - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0)); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0)); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0)); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/w)); - - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0)); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0)); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(0.0)); - OUT_ACCEL_REG(R300_VAP_PVS_VECTOR_DATA_REG, F_TO_DW(1.0/h)); - - FINISH_ACCEL(); - } else { - info->accel_state->texW[unit] = w; - info->accel_state->texH[unit] = h; - } + FINISH_ACCEL(); + } else { + info->accel_state->is_transform[unit] = !!pPict->transform; + info->accel_state->transform[unit] = pPict->transform; + info->accel_state->texWrcp[unit] = 1.0f / (float)(65536 * w); + info->accel_state->texHrcp[unit] = 1.0f / (float)(65536 * h); } return TRUE; @@ -2147,8 +2127,6 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn, int w, int h) { int vtx_count; - xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight; - static xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight; ACCEL_PREAMBLE(); ENTER_DRAW(0); @@ -2172,45 +2150,9 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn, } #endif - srcTopLeft.x = IntToxFixed(srcX); - srcTopLeft.y = IntToxFixed(srcY); - srcTopRight.x = IntToxFixed(srcX + w); - srcTopRight.y = IntToxFixed(srcY); - srcBottomLeft.x = IntToxFixed(srcX); - srcBottomLeft.y = IntToxFixed(srcY + h); - srcBottomRight.x = IntToxFixed(srcX + w); - srcBottomRight.y = IntToxFixed(srcY + h); - - if (info->accel_state->is_transform[0]) { - if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) { - transformPoint(info->accel_state->transform[0], &srcTopLeft); - transformPoint(info->accel_state->transform[0], &srcTopRight); - transformPoint(info->accel_state->transform[0], &srcBottomLeft); - transformPoint(info->accel_state->transform[0], &srcBottomRight); - } - } - - if (info->accel_state->msk_pic) { - maskTopLeft.x = IntToxFixed(maskX); - maskTopLeft.y = IntToxFixed(maskY); - maskTopRight.x = IntToxFixed(maskX + w); - maskTopRight.y = IntToxFixed(maskY); - maskBottomLeft.x = IntToxFixed(maskX); - maskBottomLeft.y = IntToxFixed(maskY + h); - maskBottomRight.x = IntToxFixed(maskX + w); - maskBottomRight.y = IntToxFixed(maskY + h); - - if (info->accel_state->is_transform[1]) { - if ((info->ChipFamily < CHIP_FAMILY_R300) || !info->accel_state->has_tcl) { - transformPoint(info->accel_state->transform[1], &maskTopLeft); - transformPoint(info->accel_state->transform[1], &maskTopRight); - transformPoint(info->accel_state->transform[1], &maskBottomLeft); - transformPoint(info->accel_state->transform[1], &maskBottomRight); - } - } - + if (info->accel_state->msk_pic) vtx_count = 6; - } else + else vtx_count = 4; if (info->accel_state->vsync) @@ -2285,32 +2227,108 @@ static void FUNC_NAME(RadeonCompositeTile)(ScrnInfoPtr pScrn, #endif - if (info->accel_state->msk_pic) { - if (IS_R300_3D || IS_R500_3D) { - VTX_OUT_MASK((float)dstX, (float)dstY, - xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0], - xFixedToFloat(maskTopLeft.x) / info->accel_state->texW[1], xFixedToFloat(maskTopLeft.y) / info->accel_state->texH[1]); + if ((info->ChipFamily >= CHIP_FAMILY_R300) && info->accel_state->has_tcl) { + float dstX1, dstY1, dstX2, dstY2; + float srcX1, srcY1, srcX2, srcY2; + float fw, fh; + + fw = w; + fh = h; + dstX1 = dstX; + dstY1 = dstY; + dstX2 = dstX1 + fw; + dstY2 = dstY1 + fh; + srcX1 = srcX; + srcY1 = srcY; + srcX2 = srcX1 + fw; + srcY2 = srcY1 + fh; + + if (info->accel_state->msk_pic) { + float maskX1, maskY1, maskX2, maskY2; + + maskX1 = maskX; + maskY1 = maskY; + maskX2 = maskX1 + fw; + maskY2 = maskY1 + fh; + + VTX_OUT_MASK(dstX1, dstY1, srcX1, srcY1, maskX1, maskY1); + VTX_OUT_MASK(dstX1, dstY2, srcX1, srcY2, maskX1, maskY2); + VTX_OUT_MASK(dstX2, dstY2, srcX2, srcY2, maskX2, maskY2); + VTX_OUT_MASK(dstX2, dstY1, srcX2, srcY1, maskX2, maskY1); + } else { + VTX_OUT(dstX1, dstY1, srcX1, srcY1); + VTX_OUT(dstX1, dstY2, srcX1, srcY2); + VTX_OUT(dstX2, dstY2, srcX2, srcY2); + VTX_OUT(dstX2, dstY1, srcX2, srcY1); } - VTX_OUT_MASK((float)dstX, (float)(dstY + h), - xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0], - xFixedToFloat(maskBottomLeft.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomLeft.y) / info->accel_state->texH[1]); - VTX_OUT_MASK((float)(dstX + w), (float)(dstY + h), - xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0], - xFixedToFloat(maskBottomRight.x) / info->accel_state->texW[1], xFixedToFloat(maskBottomRight.y) / info->accel_state->texH[1]); - VTX_OUT_MASK((float)(dstX + w), (float)dstY, - xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0], xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0], - xFixedToFloat(maskTopRight.x) / info->accel_state->texW[1], xFixedToFloat(maskTopRight.y) / info->accel_state->texH[1]); } else { - if (IS_R300_3D || IS_R500_3D) { - VTX_OUT((float)dstX, (float)dstY, - xFixedToFloat(srcTopLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcTopLeft.y) / info->accel_state->texH[0]); + xPointFixed srcTopLeft, srcTopRight, srcBottomLeft, srcBottomRight; + float srcWrcp = info->accel_state->texWrcp[0]; + float srcHrcp = info->accel_state->texHrcp[0]; + + srcTopLeft.x = IntToxFixed(srcX); + srcTopLeft.y = IntToxFixed(srcY); + srcTopRight.x = IntToxFixed(srcX + w); + srcTopRight.y = IntToxFixed(srcY); + srcBottomLeft.x = IntToxFixed(srcX); + srcBottomLeft.y = IntToxFixed(srcY + h); + srcBottomRight.x = IntToxFixed(srcX + w); + srcBottomRight.y = IntToxFixed(srcY + h); + + if (info->accel_state->is_transform[0]) { + transformPoint(info->accel_state->transform[0], &srcTopLeft); + transformPoint(info->accel_state->transform[0], &srcTopRight); + transformPoint(info->accel_state->transform[0], &srcBottomLeft); + transformPoint(info->accel_state->transform[0], &srcBottomRight); + } + + if (info->accel_state->msk_pic) { + xPointFixed maskTopLeft, maskTopRight, maskBottomLeft, maskBottomRight; + float maskWrcp = info->accel_state->texWrcp[1]; + float maskHrcp = info->accel_state->texHrcp[1]; + + maskTopLeft.x = IntToxFixed(maskX); + maskTopLeft.y = IntToxFixed(maskY); + maskTopRight.x = IntToxFixed(maskX + w); + maskTopRight.y = IntToxFixed(maskY); + maskBottomLeft.x = IntToxFixed(maskX); + maskBottomLeft.y = IntToxFixed(maskY + h); + maskBottomRight.x = IntToxFixed(maskX + w); + maskBottomRight.y = IntToxFixed(maskY + h); + + if (info->accel_state->is_transform[1]) { + transformPoint(info->accel_state->transform[1], &maskTopLeft); + transformPoint(info->accel_state->transform[1], &maskTopRight); + transformPoint(info->accel_state->transform[1], &maskBottomLeft); + transformPoint(info->accel_state->transform[1], &maskBottomRight); + } + + if (IS_R300_3D || IS_R500_3D) { + VTX_OUT_MASK((float)dstX, (float)dstY, + srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp, + maskTopLeft.x * maskWrcp, maskTopLeft.y * maskHrcp); + } + VTX_OUT_MASK((float)dstX, (float)(dstY + h), + srcBottomLeft.x * srcWrcp, srcBottomLeft.y * srcHrcp, + maskBottomLeft.x * maskWrcp, maskBottomLeft.y * maskHrcp); + VTX_OUT_MASK((float)(dstX + w), (float)(dstY + h), + srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp, + maskBottomRight.x * maskWrcp, maskBottomRight.y * maskHrcp); + VTX_OUT_MASK((float)(dstX + w), (float)dstY, + srcTopRight.x * srcWrcp, srcTopRight.y * srcHrcp, + maskTopRight.x * maskWrcp, maskTopRight.y * maskHrcp); + } else { + if (IS_R300_3D || IS_R500_3D) { + VTX_OUT((float)dstX, (float)dstY, + srcTopLeft.x * srcWrcp, srcTopLeft.y * srcHrcp); + } + VTX_OUT((float)dstX, (float)(dstY + h), + srcBottomLeft.x * srcWrcp, srcBottomLeft.y * srcHrcp); + VTX_OUT((float)(dstX + w), (float)(dstY + h), + srcBottomRight.x * srcWrcp, srcBottomRight.y * srcHrcp); + VTX_OUT((float)(dstX + w), (float)dstY, + srcTopRight.x * srcWrcp, srcTopRight.y * srcHrcp); } - VTX_OUT((float)dstX, (float)(dstY + h), - xFixedToFloat(srcBottomLeft.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomLeft.y) / info->accel_state->texH[0]); - VTX_OUT((float)(dstX + w), (float)(dstY + h), - xFixedToFloat(srcBottomRight.x) / info->accel_state->texW[0], xFixedToFloat(srcBottomRight.y) / info->accel_state->texH[0]); - VTX_OUT((float)(dstX + w), (float)dstY, - xFixedToFloat(srcTopRight.x) / info->accel_state->texW[0], xFixedToFloat(srcTopRight.y) / info->accel_state->texH[0]); } #ifdef ACCEL_CP diff --git a/src/radeon_render.c b/src/radeon_render.c index 6668fe0..68811b7 100644 --- a/src/radeon_render.c +++ b/src/radeon_render.c @@ -773,8 +773,8 @@ static Bool FUNC_NAME(R200SetupTexture)( txformat |= RADEON_TXFORMAT_NON_POWER2; } - info->accel_state->texW[0] = width; - info->accel_state->texH[0] = height; + info->accel_state->texWrcp[0] = 1.0f / width; + info->accel_state->texHrcp[0] = 1.0f / height; offset = info->accel_state->RenderTex->offset * pScrn->bitsPerPixel / 8; dst = (uint8_t*)(info->FB + offset); @@ -975,10 +975,10 @@ FUNC_NAME(R200SubsequentCPUToScreenTexture) ( r = width + l; b = height + t; - fl = (float)srcx / info->accel_state->texW[0]; - fr = (float)(srcx + width) / info->accel_state->texW[0]; - ft = (float)srcy / info->accel_state->texH[0]; - fb = (float)(srcy + height) / info->accel_state->texH[0]; + fl = (float)srcx * info->accel_state->texWrcp[0]; + fr = (float)(srcx + width) * info->accel_state->texWrcp[0]; + ft = (float)srcy * info->accel_state->texHrcp[0]; + fb = (float)(srcy + height) * info->accel_state->texHrcp[0]; #ifdef ACCEL_CP BEGIN_RING(24); -- 1.6.4.3 _______________________________________________ xorg-driver-ati mailing list xorg-driver-ati@lists.x.org http://lists.x.org/mailman/listinfo/xorg-driver-ati