Hi,
The attached patchset implements a few optimizations for the bfgminer
bitcoin mining program.
Please review.
-Tom
From 661e832408a8bafc03a7c4c600c4a140b03054b4 Mon Sep 17 00:00:00 2001
From: Dmitry Cherkassov
Date: Thu, 7 Mar 2013 20:17:59 +0400
Subject: [PATCH 1/3] R600: Add 64-bit load/store support
* Added R600_Reg64 class
* Added T#Index#.XY register definitions
* Added v2i32 register reads from parameter and global space
* Added f32 and i32 element extraction from v2f32 and v2i32
* Added v2i32 -> v2f32 conversions
Signed-off-by: Dmitry Cherkassov
Tom Stellard:
- Mark vec2 operations as expand. The addition of a vec2 register
class made them all legal.
---
lib/Target/R600/AMDGPUISelLowering.cpp | 6 +++
lib/Target/R600/AMDILISelDAGToDAG.cpp | 10 -
lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp | 3 ++
lib/Target/R600/R600ISelLowering.cpp | 17 +
lib/Target/R600/R600InstrInfo.cpp | 19 ++
lib/Target/R600/R600Instructions.td| 44 ++
lib/Target/R600/R600RegisterInfo.td| 16
test/CodeGen/R600/64bit-kernel-args.ll | 41
test/CodeGen/R600/fadd.ll | 10 +
test/CodeGen/R600/fdiv.ll | 37 +-
test/CodeGen/R600/fmul.ll | 10 +
test/CodeGen/R600/fp_to_sint.ll| 10 +
test/CodeGen/R600/fp_to_uint.ll| 10 +
test/CodeGen/R600/fsub.ll | 20 +++---
test/CodeGen/R600/setcc.ll | 18 +++--
test/CodeGen/R600/sint_to_fp.ll| 10 +
test/CodeGen/R600/udiv.ll | 20 +++---
test/CodeGen/R600/uint_to_fp.ll| 10 +
test/CodeGen/R600/urem.ll | 21 ---
19 files changed, 292 insertions(+), 40 deletions(-)
create mode 100644 test/CodeGen/R600/64bit-kernel-args.ll
diff --git a/lib/Target/R600/AMDGPUISelLowering.cpp b/lib/Target/R600/AMDGPUISelLowering.cpp
index a266df5..4a064b1 100644
--- a/lib/Target/R600/AMDGPUISelLowering.cpp
+++ b/lib/Target/R600/AMDGPUISelLowering.cpp
@@ -51,6 +51,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
+ setOperationAction(ISD::STORE, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v2f32, MVT::v2i32);
+
setOperationAction(ISD::STORE, MVT::v4f32, Promote);
AddPromotedToType(ISD::STORE, MVT::v4f32, MVT::v4i32);
@@ -60,6 +63,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(TargetMachine &TM) :
setOperationAction(ISD::LOAD, MVT::v4f32, Promote);
AddPromotedToType(ISD::LOAD, MVT::v4f32, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v2f32, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v2f32, MVT::v2i32);
+
setOperationAction(ISD::MUL, MVT::i64, Expand);
setOperationAction(ISD::UDIV, MVT::i32, Expand);
diff --git a/lib/Target/R600/AMDILISelDAGToDAG.cpp b/lib/Target/R600/AMDILISelDAGToDAG.cpp
index ba75a44..198cd7e 100644
--- a/lib/Target/R600/AMDILISelDAGToDAG.cpp
+++ b/lib/Target/R600/AMDILISelDAGToDAG.cpp
@@ -167,12 +167,20 @@ SDNode *AMDGPUDAGToDAGISel::Select(SDNode *N) {
if (ST.device()->getGeneration() > AMDGPUDeviceInfo::HD6XXX) {
break;
}
+unsigned RegSequenceClassID;
+EVT VT = N->getValueType(0);
+assert(VT.isVector());
+switch (VT.getVectorNumElements()) {
+case 4: RegSequenceClassID = AMDGPU::R600_Reg128RegClassID; break;
+case 2: RegSequenceClassID = AMDGPU::R600_Reg64RegClassID; break;
+default: llvm_unreachable("Unhandled vector width in BUILD_VECTOR");
+}
// BUILD_VECTOR is usually lowered into an IMPLICIT_DEF + 4 INSERT_SUBREG
// that adds a 128 bits reg copy when going through TwoAddressInstructions
// pass. We want to avoid 128 bits copies as much as possible because they
// can't be bundled by our scheduler.
SDValue RegSeqArgs[9] = {
- CurDAG->getTargetConstant(AMDGPU::R600_Reg128RegClassID, MVT::i32),
+ CurDAG->getTargetConstant(RegSequenceClassID, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub0, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub1, MVT::i32),
SDValue(), CurDAG->getTargetConstant(AMDGPU::sub2, MVT::i32),
diff --git a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
index 7c83d86..030fc87 100644
--- a/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/lib/Target/R600/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -150,6 +150,7 @@ void R600MCCodeEmitter::EncodeInstruction(const MCInst &MI,
raw_ostream &OS,
} else {
switch(MI.getOpcode()) {
case AMDGPU::RAT_WRITE_CACHELESS_32_eg:
+case AMDGPU::RAT_WRITE_CACHEL