Rico, 

could you give this patch a try?
If it is only slightly slower than native, we could merge it as a first step.


Anyway, if the application is well written, this function should not be called 
often. Usually an application uses transformation matrices that are much 
easier to invert.

Nozomi




________________________________
 De : Henri Verbeet <hverb...@gmail.com>
À : Rico Schüller <kgbric...@web.de> 
Cc : wine-devel@winehq.org; Nozomi Kodama <nozomi.kod...@yahoo.com> 
Envoyé le : Lundi 25 février 2013 0h08
Objet : Re: d3dx9: Avoid expensive computations
 
On 25 February 2013 10:24, Rico Schüller <kgbric...@web.de> wrote:
> I did some small tests for speed with the following results. You may also
> avoid such a lot of variable assignments like *pout = out and you may use 4
> vecs instead. This should save ~48 assignments and it should also improve
> the speed a bit more (~10%). Though, native is still 40% faster than that.
>
I'd somewhat expect native to use SSE versions of this kind of thing
when the CPU supports those instructions. You also generally want to
pay attention to the order in which you access memory, although
perhaps it doesn't matter so much here because an entire matrix should
be able to fit in a single cacheline, provided it's properly aligned.
From 5cc1a57aede7c920d105fec1ac738ba21f144787 Mon Sep 17 00:00:00 2001
From: Nozomi Kodama <nozomi.kod...@yahoo.com>
Date: Mon, 25 Feb 2013 01:26:13 -1000
Subject: Explicit computations to speed them up

---
 dlls/d3dx9_36/math.c |  199 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 160 insertions(+), 39 deletions(-)

diff --git a/dlls/d3dx9_36/math.c b/dlls/d3dx9_36/math.c
index 69c3297..99360b6 100644
--- a/dlls/d3dx9_36/math.c
+++ b/dlls/d3dx9_36/math.c
@@ -255,54 +255,175 @@ HRESULT WINAPI D3DXMatrixDecompose(D3DXVECTOR3 
*poutscale, D3DXQUATERNION *poutr
 
 FLOAT WINAPI D3DXMatrixDeterminant(const D3DXMATRIX *pm)
 {
-    D3DXVECTOR4 minor, v1, v2, v3;
-    FLOAT det;
+
+    FLOAT v[4];
 
     TRACE("pm %p\n", pm);
 
-    v1.x = pm->u.m[0][0]; v1.y = pm->u.m[1][0]; v1.z = pm->u.m[2][0]; v1.w = 
pm->u.m[3][0];
-    v2.x = pm->u.m[0][1]; v2.y = pm->u.m[1][1]; v2.z = pm->u.m[2][1]; v2.w = 
pm->u.m[3][1];
-    v3.x = pm->u.m[0][2]; v3.y = pm->u.m[1][2]; v3.z = pm->u.m[2][2]; v3.w = 
pm->u.m[3][2];
-    D3DXVec4Cross(&minor, &v1, &v2, &v3);
-    det =  - (pm->u.m[0][3] * minor.x + pm->u.m[1][3] * minor.y + 
pm->u.m[2][3] * minor.z + pm->u.m[3][3] * minor.w);
-    return det;
+    v[0] = pm->u.m[1][1] * pm->u.m[2][2] * pm->u.m[3][3] -
+        pm->u.m[1][1] * pm->u.m[2][3] * pm->u.m[3][2] -
+        pm->u.m[2][1] * pm->u.m[1][2] * pm->u.m[3][3] +
+        pm->u.m[2][1] * pm->u.m[1][3] * pm->u.m[3][2] +
+        pm->u.m[3][1] * pm->u.m[1][2] * pm->u.m[2][3] -
+        pm->u.m[3][1] * pm->u.m[1][3] * pm->u.m[2][2];
+
+    v[1] = -pm->u.m[1][0] * pm->u.m[2][2] * pm->u.m[3][3] +
+        pm->u.m[1][0] * pm->u.m[2][3] * pm->u.m[3][2] +
+        pm->u.m[2][0] * pm->u.m[1][2] * pm->u.m[3][3] -
+        pm->u.m[2][0] * pm->u.m[1][3] * pm->u.m[3][2] -
+        pm->u.m[3][0] * pm->u.m[1][2] * pm->u.m[2][3] +
+        pm->u.m[3][0] * pm->u.m[1][3] * pm->u.m[2][2];
+
+    v[2] = pm->u.m[1][0] * pm->u.m[2][1] * pm->u.m[3][3] -
+        pm->u.m[1][0] * pm->u.m[2][3] * pm->u.m[3][1] -
+        pm->u.m[2][0] * pm->u.m[1][1] * pm->u.m[3][3] +
+        pm->u.m[2][0] * pm->u.m[1][3] * pm->u.m[3][1] +
+        pm->u.m[3][0] * pm->u.m[1][1] * pm->u.m[2][3] -
+        pm->u.m[3][0] * pm->u.m[1][3] * pm->u.m[2][1];
+
+    v[3] = -pm->u.m[1][0] * pm->u.m[2][1] * pm->u.m[3][2] +
+        pm->u.m[1][0] * pm->u.m[2][2] * pm->u.m[3][1] +
+        pm->u.m[2][0] * pm->u.m[1][1] * pm->u.m[3][2] -
+        pm->u.m[2][0] * pm->u.m[1][2] * pm->u.m[3][1] -
+        pm->u.m[3][0] * pm->u.m[1][1] * pm->u.m[2][2] +
+        pm->u.m[3][0] * pm->u.m[1][2] * pm->u.m[2][1];
+
+    return pm->u.m[0][0] * v[0] + pm->u.m[0][1] * v[1] +
+        pm->u.m[0][2] * v[2] + pm->u.m[0][3] * v[3];
 }
 
 D3DXMATRIX* WINAPI D3DXMatrixInverse(D3DXMATRIX *pout, FLOAT *pdeterminant, 
const D3DXMATRIX *pm)
 {
-    int a, i, j;
-    D3DXMATRIX out;
-    D3DXVECTOR4 v, vec[3];
-    FLOAT det;
+    FLOAT v[16], det;
+    UINT i, j;
+
+    v[0] = pm->u.m[1][1] * pm->u.m[2][2] * pm->u.m[3][3] -
+        pm->u.m[1][1] * pm->u.m[2][3] * pm->u.m[3][2] -
+        pm->u.m[2][1] * pm->u.m[1][2] * pm->u.m[3][3] +
+        pm->u.m[2][1] * pm->u.m[1][3] * pm->u.m[3][2] +
+        pm->u.m[3][1] * pm->u.m[1][2] * pm->u.m[2][3] -
+        pm->u.m[3][1] * pm->u.m[1][3] * pm->u.m[2][2];
+
+    v[4] = -pm->u.m[1][0] * pm->u.m[2][2] * pm->u.m[3][3] +
+        pm->u.m[1][0] * pm->u.m[2][3] * pm->u.m[3][2] +
+        pm->u.m[2][0] * pm->u.m[1][2] * pm->u.m[3][3] -
+        pm->u.m[2][0] * pm->u.m[1][3] * pm->u.m[3][2] -
+        pm->u.m[3][0] * pm->u.m[1][2] * pm->u.m[2][3] +
+        pm->u.m[3][0] * pm->u.m[1][3] * pm->u.m[2][2];
+
+    v[8] = pm->u.m[1][0] * pm->u.m[2][1] * pm->u.m[3][3] -
+        pm->u.m[1][0] * pm->u.m[2][3] * pm->u.m[3][1] -
+        pm->u.m[2][0] * pm->u.m[1][1] * pm->u.m[3][3] +
+        pm->u.m[2][0] * pm->u.m[1][3] * pm->u.m[3][1] +
+        pm->u.m[3][0] * pm->u.m[1][1] * pm->u.m[2][3] -
+        pm->u.m[3][0] * pm->u.m[1][3] * pm->u.m[2][1];
+
+    v[12] = -pm->u.m[1][0] * pm->u.m[2][1] * pm->u.m[3][2] +
+        pm->u.m[1][0] * pm->u.m[2][2] * pm->u.m[3][1] +
+        pm->u.m[2][0] * pm->u.m[1][1] * pm->u.m[3][2] -
+        pm->u.m[2][0] * pm->u.m[1][2] * pm->u.m[3][1] -
+        pm->u.m[3][0] * pm->u.m[1][1] * pm->u.m[2][2] +
+        pm->u.m[3][0] * pm->u.m[1][2] * pm->u.m[2][1];
+
+    det = pm->u.m[0][0] * v[0] + pm->u.m[0][1] * v[4] + pm->u.m[0][2] * v[8] +
+        pm->u.m[0][3] * v[12];
+
+    if (det == 0.0f)
+        return NULL;
+    if (pdeterminant)
+        *pdeterminant = det;
+
+    v[1] = -pm->u.m[0][1] * pm->u.m[2][2] * pm->u.m[3][3] +
+        pm->u.m[0][1] * pm->u.m[2][3] * pm->u.m[3][2] +
+        pm->u.m[2][1] * pm->u.m[0][2] * pm->u.m[3][3] -
+        pm->u.m[2][1] * pm->u.m[0][3] * pm->u.m[3][2] -
+        pm->u.m[3][1] * pm->u.m[0][2] * pm->u.m[2][3] +
+        pm->u.m[3][1] * pm->u.m[0][3] * pm->u.m[2][2];
+
+    v[5] = pm->u.m[0][0] * pm->u.m[2][2] * pm->u.m[3][3] -
+        pm->u.m[0][0] * pm->u.m[2][3] * pm->u.m[3][2] -
+        pm->u.m[2][0] * pm->u.m[0][2] * pm->u.m[3][3] +
+        pm->u.m[2][0] * pm->u.m[0][3] * pm->u.m[3][2] +
+        pm->u.m[3][0] * pm->u.m[0][2] * pm->u.m[2][3] -
+        pm->u.m[3][0] * pm->u.m[0][3] * pm->u.m[2][2];
+
+    v[9] = -pm->u.m[0][0] * pm->u.m[2][1] * pm->u.m[3][3] +
+        pm->u.m[0][0] * pm->u.m[2][3] * pm->u.m[3][1] +
+        pm->u.m[2][0] * pm->u.m[0][1] * pm->u.m[3][3] -
+        pm->u.m[2][0] * pm->u.m[0][3] * pm->u.m[3][1] -
+        pm->u.m[3][0] * pm->u.m[0][1] * pm->u.m[2][3] +
+        pm->u.m[3][0] * pm->u.m[0][3] * pm->u.m[2][1];
+
+    v[13] = pm->u.m[0][0] * pm->u.m[2][1] * pm->u.m[3][2] -
+        pm->u.m[0][0] * pm->u.m[2][2] * pm->u.m[3][1] -
+        pm->u.m[2][0] * pm->u.m[0][1] * pm->u.m[3][2] +
+        pm->u.m[2][0] * pm->u.m[0][2] * pm->u.m[3][1] +
+        pm->u.m[3][0] * pm->u.m[0][1] * pm->u.m[2][2] -
+        pm->u.m[3][0] * pm->u.m[0][2] * pm->u.m[2][1];
+
+    v[2] = pm->u.m[0][1] * pm->u.m[1][2] * pm->u.m[3][3] -
+        pm->u.m[0][1] * pm->u.m[1][3] * pm->u.m[3][2] -
+        pm->u.m[1][1] * pm->u.m[0][2] * pm->u.m[3][3] +
+        pm->u.m[1][1] * pm->u.m[0][3] * pm->u.m[3][2] +
+        pm->u.m[3][1] * pm->u.m[0][2] * pm->u.m[1][3] -
+        pm->u.m[3][1] * pm->u.m[0][3] * pm->u.m[1][2];
+
+    v[6] = -pm->u.m[0][0] * pm->u.m[1][2] * pm->u.m[3][3] +
+        pm->u.m[0][0] * pm->u.m[1][3] * pm->u.m[3][2] +
+        pm->u.m[1][0] * pm->u.m[0][2] * pm->u.m[3][3] -
+        pm->u.m[1][0] * pm->u.m[0][3] * pm->u.m[3][2] -
+        pm->u.m[3][0] * pm->u.m[0][2] * pm->u.m[1][3] +
+        pm->u.m[3][0] * pm->u.m[0][3] * pm->u.m[1][2];
+
+    v[10] = pm->u.m[0][0] * pm->u.m[1][1] * pm->u.m[3][3] -
+        pm->u.m[0][0] * pm->u.m[1][3] * pm->u.m[3][1] -
+        pm->u.m[1][0] * pm->u.m[0][1] * pm->u.m[3][3] +
+        pm->u.m[1][0] * pm->u.m[0][3] * pm->u.m[3][1] +
+        pm->u.m[3][0] * pm->u.m[0][1] * pm->u.m[1][3] -
+        pm->u.m[3][0] * pm->u.m[0][3] * pm->u.m[1][1];
+
+    v[14] = -pm->u.m[0][0] * pm->u.m[1][1] * pm->u.m[3][2] +
+        pm->u.m[0][0] * pm->u.m[1][2] * pm->u.m[3][1] +
+        pm->u.m[1][0] * pm->u.m[0][1] * pm->u.m[3][2] -
+        pm->u.m[1][0] * pm->u.m[0][2] * pm->u.m[3][1] -
+        pm->u.m[3][0] * pm->u.m[0][1] * pm->u.m[1][2] +
+        pm->u.m[3][0] * pm->u.m[0][2] * pm->u.m[1][1];
+
+    v[3] = -pm->u.m[0][1] * pm->u.m[1][2] * pm->u.m[2][3] +
+        pm->u.m[0][1] * pm->u.m[1][3] * pm->u.m[2][2] +
+        pm->u.m[1][1] * pm->u.m[0][2] * pm->u.m[2][3] -
+        pm->u.m[1][1] * pm->u.m[0][3] * pm->u.m[2][2] -
+        pm->u.m[2][1] * pm->u.m[0][2] * pm->u.m[1][3] +
+        pm->u.m[2][1] * pm->u.m[0][3] * pm->u.m[1][2];
+
+    v[7] = pm->u.m[0][0] * pm->u.m[1][2] * pm->u.m[2][3] -
+        pm->u.m[0][0] * pm->u.m[1][3] * pm->u.m[2][2] -
+        pm->u.m[1][0] * pm->u.m[0][2] * pm->u.m[2][3] +
+        pm->u.m[1][0] * pm->u.m[0][3] * pm->u.m[2][2] +
+        pm->u.m[2][0] * pm->u.m[0][2] * pm->u.m[1][3] -
+        pm->u.m[2][0] * pm->u.m[0][3] * pm->u.m[1][2];
+
+    v[11] = -pm->u.m[0][0] * pm->u.m[1][1] * pm->u.m[2][3] +
+        pm->u.m[0][0] * pm->u.m[1][3] * pm->u.m[2][1] +
+        pm->u.m[1][0] * pm->u.m[0][1] * pm->u.m[2][3] -
+        pm->u.m[1][0] * pm->u.m[0][3] * pm->u.m[2][1] -
+        pm->u.m[2][0] * pm->u.m[0][1] * pm->u.m[1][3] +
+        pm->u.m[2][0] * pm->u.m[0][3] * pm->u.m[1][1];
+
+    v[15] = pm->u.m[0][0] * pm->u.m[1][1] * pm->u.m[2][2] -
+        pm->u.m[0][0] * pm->u.m[1][2] * pm->u.m[2][1] -
+        pm->u.m[1][0] * pm->u.m[0][1] * pm->u.m[2][2] +
+        pm->u.m[1][0] * pm->u.m[0][2] * pm->u.m[2][1] +
+        pm->u.m[2][0] * pm->u.m[0][1] * pm->u.m[1][2] -
+        pm->u.m[2][0] * pm->u.m[0][2] * pm->u.m[1][1];
+
+    det = 1.0f / det;
 
-    TRACE("pout %p, pdeterminant %p, pm %p\n", pout, pdeterminant, pm);
+    for (i = 0; i < 4; i++)
+        for (j = 0; j < 4; j++)
+            pout->u.m[i][j] = v[4 * i + j] * det;
 
-    det = D3DXMatrixDeterminant(pm);
-    if ( !det ) return NULL;
-    if ( pdeterminant ) *pdeterminant = det;
-    for (i=0; i<4; i++)
-    {
-        for (j=0; j<4; j++)
-        {
-            if (j != i )
-            {
-                a = j;
-                if ( j > i ) a = a-1;
-                vec[a].x = pm->u.m[j][0];
-                vec[a].y = pm->u.m[j][1];
-                vec[a].z = pm->u.m[j][2];
-                vec[a].w = pm->u.m[j][3];
-            }
-        }
-    D3DXVec4Cross(&v, &vec[0], &vec[1], &vec[2]);
-    out.u.m[0][i] = pow(-1.0f, i) * v.x / det;
-    out.u.m[1][i] = pow(-1.0f, i) * v.y / det;
-    out.u.m[2][i] = pow(-1.0f, i) * v.z / det;
-    out.u.m[3][i] = pow(-1.0f, i) * v.w / det;
-   }
-
-   *pout = out;
-   return pout;
+    return pout;
 }
 
 D3DXMATRIX* WINAPI D3DXMatrixLookAtLH(D3DXMATRIX *pout, const D3DXVECTOR3 
*peye, const D3DXVECTOR3 *pat, const D3DXVECTOR3 *pup)
-- 
1.7.10.4



Reply via email to