Nice work!

See my comments below, and double-check whether some of them also apply to the shaders I haven't reviewed yet.

I recommend that you test your work, because if a single sched code is wrong, you are likely to kill your card and have to reboot your box. :-)

On 06/03/2017 04:16 PM, Aaryaman Vasishta wrote:
v2: Add missing delays

This patch adds proper delays to maxwell exa shaders. rendercheck tests
seem consistent with/without this patch. I haven't extensively tested
them though.

Trello:
https://trello.com/c/6LPB2EIS/174-update-maxwell-shaders-with-proper-delays

Signed-off-by: Aaryaman Vasishta <jem456.vasis...@gmail.com>
---
  src/shader/exac8nv110.fp  | 10 +++++-----
  src/shader/exac8nv110.fpc | 18 +++++++++---------
  src/shader/exacanv110.fp  | 10 +++++-----
  src/shader/exacanv110.fpc | 18 +++++++++---------
  src/shader/exacmnv110.fp  | 10 +++++-----
  src/shader/exacmnv110.fpc | 18 +++++++++---------
  src/shader/exas8nv110.fp  |  6 +++---
  src/shader/exas8nv110.fpc | 12 ++++++------
  src/shader/exasanv110.fp  | 10 +++++-----
  src/shader/exasanv110.fpc | 18 +++++++++---------
  src/shader/exascnv110.fp  |  6 +++---
  src/shader/exascnv110.fpc | 10 +++++-----
  src/shader/videonv110.fp  | 14 +++++++-------
  src/shader/videonv110.fpc | 26 +++++++++++++-------------
  14 files changed, 93 insertions(+), 93 deletions(-)

diff --git a/src/shader/exac8nv110.fp b/src/shader/exac8nv110.fp
index ce78036..1c4a4f1 100644
--- a/src/shader/exac8nv110.fp
+++ b/src/shader/exac8nv110.fp
@@ -25,23 +25,23 @@ NV110FP_Composite_A8[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r0 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r0 $r0
  ipa $r3 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 rd 0x1 wt 0x3) (st 0xf wr 0x1 wt 0x2)
  ipa $r2 a[0x90] $r0 0x0 0x1
  tex nodep $r1 $r2 0x0 0x1 t2d 0x8
  ipa $r3 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2) (st 0xf wr 0x1 wt 0x6) (st 0xf)
  ipa $r2 a[0x80] $r0 0x0 0x1
  tex nodep $r0 $r2 0x0 0x0 t2d 0x8

Out of curiosity, why didn't you add a read-dep-bar on $r2:$r3 here?

  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6 wt 0x3) (st 0x6) (st 0x1)
  fmul ftz $r3 $r0 $r1
  mov $r2 $r3 0xf

You can stall for only one cycle here, but the 6 cycles on fmul are needed.

  mov $r1 $r3 0xf
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6) (st 0xf) (st 0x0)
  mov $r0 $r3 0xf

Same here.

  exit
  #endif
diff --git a/src/shader/exac8nv110.fpc b/src/shader/exac8nv110.fpc
index 4aa1368..46943b7 100644
--- a/src/shader/exac8nv110.fpc
+++ b/src/shader/exac8nv110.fpc
@@ -1,36 +1,36 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff00,
  0xe003ff87,
  0x00470000,
  0x50800000,
  0x4007ff03,
  0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0x21e0072f,
+0x005cbc03,
  0x0007ff02,
  0xe043ff89,
  0x2ff70201,
  0xc03a0014,
  0x4007ff03,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe5e0074f,
+0x001fbc06,
  0x0007ff02,
  0xe043ff88,
  0x2ff70200,
  0xc03a0004,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfcc01fe6,
+0x001f8400,
  0x00170003,
  0x5c681000,
  0x00370002,
  0x5c980780,
  0x00370001,
  0x5c980780,
-0xfc0007e0,
+0xfde007e6,
  0x001f8000,
  0x00370000,
  0x5c980780,
diff --git a/src/shader/exacanv110.fp b/src/shader/exacanv110.fp
index a70d5c5..d7c2867 100644
--- a/src/shader/exacanv110.fp
+++ b/src/shader/exacanv110.fp
@@ -25,23 +25,23 @@ NV110FP_CAComposite[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r0 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r0 $r0
  ipa $r3 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
  ipa $r2 a[0x90] $r0 0x0 0x1
  tex nodep $r4 $r2 0x0 0x1 t2d 0xf

Please add a read-dep-bar and wait for it on the first fmul, because $r2:$r3 are re-used before $r4. That should be safer.

  ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
  ipa $r0 a[0x80] $r0 0x0 0x1
  tex nodep $r0 $r0 0x0 0x0 t2d 0xf
  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
  fmul ftz $r3 $r3 $r7

Why are you waiting on all barriers? Only $r3 is needed here.

  fmul ftz $r2 $r2 $r6
  fmul ftz $r1 $r1 $r5
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3) (st 0xf) (st 0x0)
  fmul ftz $r0 $r0 $r4
  exit
  #endif
diff --git a/src/shader/exacanv110.fpc b/src/shader/exacanv110.fpc
index 7c0ca5e..9cad139 100644
--- a/src/shader/exacanv110.fpc
+++ b/src/shader/exacanv110.fpc
@@ -1,36 +1,36 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff00,
  0xe003ff87,
  0x00470000,
  0x50800000,
  0x4007ff03,
  0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x0008bc03,
  0x0007ff02,
  0xe043ff89,
  0xaff70204,
  0xc03a0017,
  0x4007ff01,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe5e0274f,
+0x001fbc06,
  0x0007ff00,
  0xe043ff88,
  0xaff70000,
  0xc03a0007,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc21ffe1,
+0x001f8400,
  0x00770303,
  0x5c681000,
  0x00670202,
  0x5c681000,
  0x00570101,
  0x5c681000,
-0xfc0007e0,
+0xfde01fe1,
  0x001f8000,
  0x00470000,
  0x5c681000,
diff --git a/src/shader/exacmnv110.fp b/src/shader/exacmnv110.fp
index fe5c294..d717138 100644
--- a/src/shader/exacmnv110.fp
+++ b/src/shader/exacmnv110.fp
@@ -25,23 +25,23 @@ NV110FP_Composite[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r0 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r0 $r0
  ipa $r3 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
  ipa $r2 a[0x90] $r0 0x0 0x1
  tex nodep $r4 $r2 0x0 0x1 t2d 0x8
  ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
  ipa $r0 a[0x80] $r0 0x0 0x1
  tex nodep $r0 $r0 0x0 0x0 t2d 0xf
  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
  fmul ftz $r3 $r3 $r4
  fmul ftz $r2 $r2 $r4
  fmul ftz $r1 $r1 $r4
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6 wt 0x2) (st 0xf) (st 0x0)
  fmul ftz $r0 $r0 $r4
  exit
  #endif
diff --git a/src/shader/exacmnv110.fpc b/src/shader/exacmnv110.fpc
index 9d62c1a..c150875 100644
--- a/src/shader/exacmnv110.fpc
+++ b/src/shader/exacmnv110.fpc
@@ -1,36 +1,36 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff00,
  0xe003ff87,
  0x00470000,
  0x50800000,
  0x4007ff03,
  0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x0008bc03,
  0x0007ff02,
  0xe043ff89,
  0x2ff70204,
  0xc03a0014,
  0x4007ff01,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe5e0274f,
+0x001fbc06,
  0x0007ff00,
  0xe043ff88,
  0xaff70000,
  0xc03a0007,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc21ffe1,
+0x001f8400,
  0x00470303,
  0x5c681000,
  0x00470202,
  0x5c681000,
  0x00470101,
  0x5c681000,
-0xfc0007e0,
+0xfde017e6,
  0x001f8000,
  0x00470000,
  0x5c681000,
diff --git a/src/shader/exas8nv110.fp b/src/shader/exas8nv110.fp
index 4fe2e19..a555beb 100644
--- a/src/shader/exas8nv110.fp
+++ b/src/shader/exas8nv110.fp
@@ -25,15 +25,15 @@ NV110FP_Source_A8[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r0 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r0 $r0
  ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf)
  ipa $r0 a[0x80] $r0 0x0 0x1
  tex nodep $r0 $r0 0x0 0x0 t2d 0x8
  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x1) (st 0x1) (st 0x1)
  mov $r3 $r0 0xf
  mov $r2 $r0 0xf
  mov $r1 $r0 0xf

This one looks good!

diff --git a/src/shader/exas8nv110.fpc b/src/shader/exas8nv110.fpc
index 1181c41..e58d168 100644
--- a/src/shader/exas8nv110.fpc
+++ b/src/shader/exas8nv110.fpc
@@ -1,21 +1,21 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff00,
  0xe003ff87,
  0x00470000,
  0x50800000,
  0x4007ff01,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x001fbc03,
  0x0007ff00,
  0xe043ff88,
  0x2ff70000,
  0xc03a0004,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc200fe1,
+0x001f8400,
  0x00070003,
  0x5c980780,
  0x00070002,
diff --git a/src/shader/exasanv110.fp b/src/shader/exasanv110.fp
index 61374a6..ad7ca36 100644
--- a/src/shader/exasanv110.fp
+++ b/src/shader/exasanv110.fp
@@ -25,23 +25,23 @@ NV110FP_CACompositeSrcAlpha[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r0 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r0 $r0
  ipa $r3 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1 rd 0x2)
  ipa $r2 a[0x80] $r0 0x0 0x1
  tex nodep $r4 $r2 0x0 0x0 t2d 0x8
  ipa $r1 a[0x94] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x2 wt 0x4) (st 0xf wr 0x1 wt 0x6) (st 0xf)
  ipa $r0 a[0x90] $r0 0x0 0x1
  tex nodep $r0 $r0 0x0 0x1 t2d 0xf
  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x3f) (st 0x1) (st 0x1)
  fmul ftz $r3 $r3 $r4
  fmul ftz $r2 $r2 $r4
  fmul ftz $r1 $r1 $r4
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1 wt 0x2) (st 0xf) (st 0x0)
  fmul ftz $r0 $r0 $r4
  exit
  #endif
diff --git a/src/shader/exasanv110.fpc b/src/shader/exasanv110.fpc
index 5516a03..1485f11 100644
--- a/src/shader/exasanv110.fpc
+++ b/src/shader/exasanv110.fpc
@@ -1,36 +1,36 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff00,
  0xe003ff87,
  0x00470000,
  0x50800000,
  0x4007ff03,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x0008bc03,
  0x0007ff02,
  0xe043ff88,
  0x2ff70204,
  0xc03a0004,
  0x4007ff01,
  0xe043ff89,
-0xfc0007e0,
-0x001f8000,
+0xe5e0274f,
+0x001fbc06,
  0x0007ff00,
  0xe043ff89,
  0xaff70000,
  0xc03a0017,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc21ffe1,
+0x001f8400,
  0x00470303,
  0x5c681000,
  0x00470202,
  0x5c681000,
  0x00470101,
  0x5c681000,
-0xfc0007e0,
+0xfde017e1,
  0x001f8000,
  0x00470000,
  0x5c681000,
diff --git a/src/shader/exascnv110.fp b/src/shader/exascnv110.fp
index 90bbb55..86e14e8 100644
--- a/src/shader/exascnv110.fp
+++ b/src/shader/exascnv110.fp
@@ -25,14 +25,14 @@ NV110FP_Source[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r0 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r0 $r0
  ipa $r1 a[0x84] $r0 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wt 0x3) (st 0xf)
  ipa $r0 a[0x80] $r0 0x0 0x1
  tex nodep $r0 $r0 0x0 0x0 t2d 0xf
  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf) (st 0x0) (st 0x0)

Looks good.

  exit
  #endif
diff --git a/src/shader/exascnv110.fpc b/src/shader/exascnv110.fpc
index 2dba15d..1fef5d2 100644
--- a/src/shader/exascnv110.fpc
+++ b/src/shader/exascnv110.fpc
@@ -1,20 +1,20 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff00,
  0xe003ff87,
  0x00470000,
  0x50800000,
  0x4007ff01,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xfde0072f,
+0x001fbc03,
  0x0007ff00,
  0xe043ff88,
  0xaff70000,
  0xc03a0007,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
+0xfc0007ef,
  0x001f8000,
  0x0007000f,
  0xe3000000,
diff --git a/src/shader/videonv110.fp b/src/shader/videonv110.fp
index 2728311..dd3816c 100644
--- a/src/shader/videonv110.fp
+++ b/src/shader/videonv110.fp
@@ -25,30 +25,30 @@ NV110FP_NV12[] = {
  };
  #else
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x0) (st 0xd wr 0x0 wt 0x1) (st 0xf wr 0x0 wt 0x1)
  ipa pass $r2 a[0x7c] 0x0 0x0 0x1
  mufu rcp $r2 $r2
  ipa $r0 a[0x80] $r2 0x0 0x1
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf wr 0x1) (st 0xf wr 0x0 wt 0x3) (st 0xf wr 0x1)
  ipa $r1 a[0x84] $r2 0x0 0x1
  tex nodep $r4 $r0 0x0 0x0 t2d 0x8
  tex nodep $r0 $r0 0x0 0x1 t2d 0xc
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf) (st 0x6 wt 0x1) (st 0x6)
  depbar le 0x5 0x1 0x1
  fmul ftz $r5 $r4 c0[0x0]
  fadd ftz $r3 $r5 c0[0x4]
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6) (st 0x6) (st 0xf)
  fadd ftz $r4 $r5 c0[0x8]
  fadd ftz $r5 $r5 c0[0xc]
  depbar le 0x5 0x0 0x0
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x6 wt 0x2) (st 0x1) (st 0x1)
  ffma ftz $r3 $r0 c0[0x10] $r3
  ffma ftz $r4 $r0 c0[0x14] $r4
  ffma ftz $r5 $r0 c0[0x18] $r5
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0x1) (st 0x1) (st 0x6)
  ffma ftz $r0 $r1 c0[0x1c] $r3
  ffma ftz $r2 $r1 c0[0x24] $r5
  ffma ftz $r1 $r1 c0[0x20] $r4
-sched (st 0x0) (st 0x0) (st 0x0)
+sched (st 0xf) (st 0x0) (st 0x0)
  exit
  #endif
diff --git a/src/shader/videonv110.fpc b/src/shader/videonv110.fpc
index 31d745a..8fbc246 100644
--- a/src/shader/videonv110.fpc
+++ b/src/shader/videonv110.fpc
@@ -1,52 +1,52 @@
-0xfc0007e0,
-0x001f8000,
+0xe1a0070f,
+0x003c3c01,
  0xcff7ff02,
  0xe003ff87,
  0x00470202,
  0x50800000,
  0x0027ff00,
  0xe043ff88,
-0xfc0007e0,
-0x001f8000,
+0xe1e0072f,
+0x001cbc03,
  0x4027ff01,
  0xe043ff88,
  0x2ff70004,
  0xc03a0004,
  0x2ff70000,
  0xc03a0016,
-0xfc0007e0,
-0x001f8000,
+0xfcc007ef,
+0x001f9801,
  0x34170001,
  0xf0f00000,
  0x00070405,
  0x4c681000,
  0x00170503,
  0x4c581000,
-0xfc0007e0,
-0x001f8000,
+0xfcc007e6,
+0x001fbc00,
  0x00270504,
  0x4c581000,
  0x00370505,
  0x4c581000,
  0x34070000,
  0xf0f00000,
-0xfc0007e0,
-0x001f8000,
+0xfc2017e6,
+0x001f8400,
  0x00470003,
  0x49a00180,
  0x00570004,
  0x49a00200,
  0x00670005,
  0x49a00280,
-0xfc0007e0,
-0x001f8000,
+0xfc2007e1,
+0x001f9800,
  0x00770100,
  0x49a00180,
  0x00970102,
  0x49a00280,
  0x00870101,
  0x49a00200,
-0xfc0007e0,
+0xfc0007ef,
  0x001f8000,
  0x0007000f,
  0xe3000000,

_______________________________________________
Nouveau mailing list
Nouveau@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/nouveau

Reply via email to