Re: [Mesa-dev] R600/SI: Initial double precision support for Radeon SI

2013-07-09 Thread Niels Ole Salscheider
Hi Tom,

 All these patches look good to me, but #2 and #6 should have a test case
 with them.  If you resubmit these patches with test cases, I will push the
 entire series.

I have attached an updated patchset. I have added a test case to patch #2 and 
#6. I have also replaced the scalar move in patch #2 by a vector move since 
there is probably no point in having a floating point value in a scalar 
register.

Kind regards,

Ole

From 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001
From: Niels Ole Salscheider niels_...@salscheider-online.de
Date: Sat, 1 Jun 2013 16:48:56 +0200
Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI

---
 lib/Target/R600/AMDGPUISelLowering.cpp |  6 ++
 lib/Target/R600/SIISelLowering.cpp |  1 +
 lib/Target/R600/SIInstructions.td  | 30 +-
 test/CodeGen/R600/fadd64.ll| 13 +
 test/CodeGen/R600/fdiv64.ll| 14 ++
 test/CodeGen/R600/fmul64.ll| 13 +
 test/CodeGen/R600/load64.ll| 20 
 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
 create mode 100644 test/CodeGen/R600/fadd64.ll
 create mode 100644 test/CodeGen/R600/fdiv64.ll
 create mode 100644 test/CodeGen/R600/fmul64.ll
 create mode 100644 test/CodeGen/R600/load64.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4019a1f..5f3d496 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 9d4cfef..0d17a12 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine &TM) :
 
   addRegisterClass(MVT::v2i32, &AMDGPU::VReg_64RegClass);
   addRegisterClass(MVT::v2f32, &AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::f64, &AMDGPU::VReg_64RegClass);
 
   addRegisterClass(MVT::v4i32, &AMDGPU::VReg_128RegClass);
   addRegisterClass(MVT::v4f32, &AMDGPU::VReg_128RegClass);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9c96c08..b956387 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 <
   [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
 >;
 defm V_RSQ_F32 : VOP1_32 <0x002e, "V_RSQ_F32", []>;
-defm V_RCP_F64 : VOP1_64 <0x002f, "V_RCP_F64", []>;
+defm V_RCP_F64 : VOP1_64 <0x002f, "V_RCP_F64",
+  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+>;
 defm V_RCP_CLAMP_F64 : VOP1_64 <0x0030, "V_RCP_CLAMP_F64", []>;
 defm V_RSQ_F64 : VOP1_64 <0x0031, "V_RSQ_F64", []>;
 defm V_RSQ_CLAMP_F64 : VOP1_64 <0x0032, "V_RSQ_CLAMP_F64", []>;
@@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift <0x0162, "V_LSHR_B64",
 >;
 def V_ASHR_I64 : VOP3_64_Shift <0x0163, "V_ASHR_I64", []>;
 
+let isCommutable = 1 in {
+
 def V_ADD_F64 : VOP3_64 <0x0164, "V_ADD_F64", []>;
 def V_MUL_F64 : VOP3_64 <0x0165, "V_MUL_F64", []>;
 def V_MIN_F64 : VOP3_64 <0x0166, "V_MIN_F64", []>;
 def V_MAX_F64 : VOP3_64 <0x0167, "V_MAX_F64", []>;
+
+} // isCommutable = 1
+
+def : Pat <
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+>;
+
+def : Pat <
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+>;
+
 def V_LDEXP_F64 : VOP3_64 <0x0168, "V_LDEXP_F64", []>;
 
 let isCommutable = 1 in {
@@ -1417,6 +1434,10 @@ def : BitConvert <i32, f32, VReg_32>;
 def : BitConvert <f32, i32, SReg_32>;
 def : BitConvert <f32, i32, VReg_32>;
 
+def : BitConvert <i64, f64, VReg_64>;
+
+def : BitConvert <f64, i64, VReg_64>;
+
 /** === **/
/** Src & Dst modifiers **/
 /** === **/
@@ -1505,6 +1526,11 @@ def : Pat <
   (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
 >;
 
+def : Pat <
+  (fdiv f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+>;
+
 def : Pat <
   (fcos f32:$src0),
   (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
@@ -1634,6 +1660,8 @@ multiclass MUBUFLoad_Pattern <MUBUF Instr_ADDR64, ValueType vt,
   >;
 }
 
+defm : MUBUFLoad_Pattern <BUFFER_LOAD_DWORDX2_ADDR64, i64,
+  global_load, constant_load>;

Re: [Mesa-dev] R600/SI: Initial double precision support for Radeon SI

2013-07-08 Thread Tom Stellard
On Tue, Jul 02, 2013 at 10:44:10AM +0200, Niels Ole Salscheider wrote:
 Hi,
 
 the attached patches add initial support for double precision operations on 
 Southern Islands cards.
 
 Some expressions containing multiple double precision kernel arguments cause 
 llvm to run until all memory is used - but I do not (yet) know why.
 It works fine as long as I pass pointers to double values.
 

I may have an idea about why this is happening.  Could you file a bug
report and attach an LLVM IR test case?

All these patches look good to me, but #2 and #6 should have a test case
with them.  If you resubmit these patches with test cases, I will push the
entire series.

Nice work!

-Tom


 Regards,
 
 Ole

 From 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001
 From: Niels Ole Salscheider niels_...@salscheider-online.de
 Date: Sat, 1 Jun 2013 16:48:56 +0200
 Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI
 
 ---
  lib/Target/R600/AMDGPUISelLowering.cpp |  6 ++
  lib/Target/R600/SIISelLowering.cpp |  1 +
  lib/Target/R600/SIInstructions.td  | 30 +-
  test/CodeGen/R600/fadd64.ll| 13 +
  test/CodeGen/R600/fdiv64.ll| 14 ++
  test/CodeGen/R600/fmul64.ll| 13 +
  test/CodeGen/R600/load64.ll| 20 
  7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
  create mode 100644 test/CodeGen/R600/fadd64.ll
  create mode 100644 test/CodeGen/R600/fdiv64.ll
  create mode 100644 test/CodeGen/R600/fmul64.ll
  create mode 100644 test/CodeGen/R600/load64.ll
 
 diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp 
 b/lib/Target/R600/AMDGPUISelLowering.cpp
 index 4019a1f..5f3d496 100644
 --- a/lib/Target/R600/AMDGPUISelLowering.cpp
 +++ b/lib/Target/R600/AMDGPUISelLowering.cpp
 @@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine 
 TM) :
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
  
 +  setOperationAction(ISD::STORE, MVT::f64, Promote);
 +  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
 +
setOperationAction(ISD::LOAD, MVT::f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
  
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
  
 +  setOperationAction(ISD::LOAD, MVT::f64, Promote);
 +  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
 +
setOperationAction(ISD::MUL, MVT::i64, Expand);
  
setOperationAction(ISD::UDIV, MVT::i32, Expand);
 diff --git a/lib/Target/R600/SIISelLowering.cpp 
 b/lib/Target/R600/SIISelLowering.cpp
 index 9d4cfef..0d17a12 100644
 --- a/lib/Target/R600/SIISelLowering.cpp
 +++ b/lib/Target/R600/SIISelLowering.cpp
 @@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
  
addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass);
addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass);
 +  addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass);
  
addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass);
addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass);
 diff --git a/lib/Target/R600/SIInstructions.td 
 b/lib/Target/R600/SIInstructions.td
 index 9c96c08..b956387 100644
 --- a/lib/Target/R600/SIInstructions.td
 +++ b/lib/Target/R600/SIInstructions.td
 @@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 
[(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
  ;
  defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, [];
 -defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, [];
 +defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64,
 +  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
 +;
  defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, [];
  defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, [];
  defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, [];
 @@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, 
 V_LSHR_B64,
  ;
  def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, [];
  
 +let isCommutable = 1 in {
 +
  def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, [];
  def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, [];
  def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, [];
  def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, [];
 +
 +} // isCommutable = 1
 +
 +def : Pat 
 +  (fadd f64:$src0, f64:$src1),
 +  (V_ADD_F64 $src0, $src1, (i64 0))
 +;
 +
 +def : Pat  
 +  (fmul f64:$src0, f64:$src1),
 +  (V_MUL_F64 $src0, $src1, (i64 0))
 +;
 +
  def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, [];
  
  let isCommutable = 1 in {
 @@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32;
  def : BitConvert f32, i32, SReg_32;
  def : BitConvert f32, i32, VReg_32;
  
 +def : BitConvert i64, f64, VReg_64;
 +
 +def : BitConvert f64, i64, VReg_64;
 +
  /** === **/
  /** Src  Dst modifiers **/
  /** === **/
 @@ -1505,6 +1526,11 @@ def : Pat
 

[Mesa-dev] R600/SI: Initial double precision support for Radeon SI

2013-07-02 Thread Niels Ole Salscheider
Hi,

the attached patches add initial support for double precision operations on 
Southern Islands cards.

Some expressions containing multiple double precision kernel arguments cause 
llvm to run until all memory is used - but I do not (yet) know why.
It works fine as long as I pass pointers to double values.

Regards,

Ole

From 4224b314cf2d97cdf2ac99564d6155fa04fbb971 Mon Sep 17 00:00:00 2001
From: Niels Ole Salscheider niels_...@salscheider-online.de
Date: Sat, 1 Jun 2013 16:48:56 +0200
Subject: [PATCH 1/6] R600/SI: Add initial double precision support for SI

---
 lib/Target/R600/AMDGPUISelLowering.cpp |  6 ++
 lib/Target/R600/SIISelLowering.cpp |  1 +
 lib/Target/R600/SIInstructions.td  | 30 +-
 test/CodeGen/R600/fadd64.ll| 13 +
 test/CodeGen/R600/fdiv64.ll| 14 ++
 test/CodeGen/R600/fmul64.ll| 13 +
 test/CodeGen/R600/load64.ll| 20 
 7 Dateien geändert, 96 Zeilen hinzugefügt(+), 1 Zeile entfernt(-)
 create mode 100644 test/CodeGen/R600/fadd64.ll
 create mode 100644 test/CodeGen/R600/fdiv64.ll
 create mode 100644 test/CodeGen/R600/fmul64.ll
 create mode 100644 test/CodeGen/R600/load64.ll

diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index 4019a1f..5f3d496 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -60,12 +60,18 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine TM) :
   setOperationAction(ISD::STORE, MVT::v4f32, Promote);
   AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::STORE, MVT::f64, Promote);
+  AddPromotedToType(ISD::STORE, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::f32, MVT::i32);
 
   setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
   AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
 
+  setOperationAction(ISD::LOAD, MVT::f64, Promote);
+  AddPromotedToType(ISD::LOAD, MVT::f64, MVT::i64);
+
   setOperationAction(ISD::MUL, MVT::i64, Expand);
 
   setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/SIISelLowering.cpp b/lib/Target/R600/SIISelLowering.cpp
index 9d4cfef..0d17a12 100644
--- a/lib/Target/R600/SIISelLowering.cpp
+++ b/lib/Target/R600/SIISelLowering.cpp
@@ -45,6 +45,7 @@ SITargetLowering::SITargetLowering(TargetMachine TM) :
 
   addRegisterClass(MVT::v2i32, AMDGPU::VReg_64RegClass);
   addRegisterClass(MVT::v2f32, AMDGPU::VReg_64RegClass);
+  addRegisterClass(MVT::f64, AMDGPU::VReg_64RegClass);
 
   addRegisterClass(MVT::v4i32, AMDGPU::VReg_128RegClass);
   addRegisterClass(MVT::v4f32, AMDGPU::VReg_128RegClass);
diff --git a/lib/Target/R600/SIInstructions.td b/lib/Target/R600/SIInstructions.td
index 9c96c08..b956387 100644
--- a/lib/Target/R600/SIInstructions.td
+++ b/lib/Target/R600/SIInstructions.td
@@ -660,7 +660,9 @@ defm V_RSQ_LEGACY_F32 : VOP1_32 
   [(set f32:$dst, (int_AMDGPU_rsq f32:$src0))]
 ;
 defm V_RSQ_F32 : VOP1_32 0x002e, V_RSQ_F32, [];
-defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64, [];
+defm V_RCP_F64 : VOP1_64 0x002f, V_RCP_F64,
+  [(set f64:$dst, (fdiv FP_ONE, f64:$src0))]
+;
 defm V_RCP_CLAMP_F64 : VOP1_64 0x0030, V_RCP_CLAMP_F64, [];
 defm V_RSQ_F64 : VOP1_64 0x0031, V_RSQ_F64, [];
 defm V_RSQ_CLAMP_F64 : VOP1_64 0x0032, V_RSQ_CLAMP_F64, [];
@@ -996,10 +998,25 @@ def V_LSHR_B64 : VOP3_64_Shift 0x0162, V_LSHR_B64,
 ;
 def V_ASHR_I64 : VOP3_64_Shift 0x0163, V_ASHR_I64, [];
 
+let isCommutable = 1 in {
+
 def V_ADD_F64 : VOP3_64 0x0164, V_ADD_F64, [];
 def V_MUL_F64 : VOP3_64 0x0165, V_MUL_F64, [];
 def V_MIN_F64 : VOP3_64 0x0166, V_MIN_F64, [];
 def V_MAX_F64 : VOP3_64 0x0167, V_MAX_F64, [];
+
+} // isCommutable = 1
+
+def : Pat 
+  (fadd f64:$src0, f64:$src1),
+  (V_ADD_F64 $src0, $src1, (i64 0))
+;
+
+def : Pat  
+  (fmul f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, $src1, (i64 0))
+;
+
 def V_LDEXP_F64 : VOP3_64 0x0168, V_LDEXP_F64, [];
 
 let isCommutable = 1 in {
@@ -1417,6 +1434,10 @@ def : BitConvert i32, f32, VReg_32;
 def : BitConvert f32, i32, SReg_32;
 def : BitConvert f32, i32, VReg_32;
 
+def : BitConvert i64, f64, VReg_64;
+
+def : BitConvert f64, i64, VReg_64;
+
 /** === **/
 /** Src  Dst modifiers **/
 /** === **/
@@ -1505,6 +1526,11 @@ def : Pat
   (V_MUL_F32_e32 $src0, (V_RCP_F32_e32 $src1))
 ;
 
+def : Pat
+  (fdiv f64:$src0, f64:$src1),
+  (V_MUL_F64 $src0, (V_RCP_F64_e32 $src1), (i64 0))
+;
+
 def : Pat 
   (fcos f32:$src0),
   (V_COS_F32_e32 (V_MUL_F32_e32 $src0, (V_MOV_B32_e32 CONST.TWO_PI_INV)))
@@ -1634,6 +1660,8 @@ multiclass MUBUFLoad_Pattern MUBUF Instr_ADDR64, ValueType vt,
   ;
 }
 
+defm : MUBUFLoad_Pattern BUFFER_LOAD_DWORDX2_ADDR64, i64,
+  global_load, constant_load;
 defm : MUBUFLoad_Pattern