[AMDGPU] Propagate defining src reg for AGPR to AGPR Copys

On targets that do not support AGPR to AGPR copying directly, try to find the
defining accvgpr_write and propagate its source vgpr register to the copies
before register allocation so the source vgpr register does not get clobbered.

The postrapseudos pass also attempts to propagate the source of the defining
accvgpr_write, but if the register to propagate is clobbered, it gives up and
creates new temporary vgpr registers instead.

Reviewed By: rampitec

Differential Revision: https://reviews.llvm.org/D108830
This commit is contained in:
Vang Thao 2021-08-27 09:56:12 -07:00
parent 80b92db02c
commit 1443ba6163
3 changed files with 236 additions and 27 deletions

View File

@ -40,6 +40,7 @@ namespace {
class GCNPreRAOptimizations : public MachineFunctionPass {
private:
const SIInstrInfo *TII;
const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;
LiveIntervals *LIS;
@ -85,32 +86,106 @@ bool GCNPreRAOptimizations::processReg(Register Reg) {
MachineInstr *Def0 = nullptr;
MachineInstr *Def1 = nullptr;
uint64_t Init = 0;
bool Changed = false;
SmallSet<Register, 32> ModifiedRegs;
bool IsAGPRDst = TRI->isAGPRClass(MRI->getRegClass(Reg));
for (MachineInstr &I : MRI->def_instructions(Reg)) {
if (I.getOpcode() != AMDGPU::S_MOV_B32 || I.getOperand(0).getReg() != Reg ||
!I.getOperand(1).isImm() || I.getNumOperands() != 2)
return false;
switch (I.getOperand(0).getSubReg()) {
switch (I.getOpcode()) {
default:
return false;
case AMDGPU::sub0:
if (Def0)
return false;
Def0 = &I;
Init |= I.getOperand(1).getImm() & 0xffffffff;
case AMDGPU::V_ACCVGPR_WRITE_B32_e64:
break;
case AMDGPU::sub1:
if (Def1)
case AMDGPU::COPY: {
// Some subtargets cannot do an AGPR to AGPR copy directly, and need an
// intermediate temporary VGPR register. Try to find the defining
// accvgpr_write to avoid temporary registers.
if (!IsAGPRDst)
break;
Register SrcReg = I.getOperand(1).getReg();
if (!SrcReg.isVirtual())
break;
// Check if source of copy is from another AGPR.
bool IsAGPRSrc = TRI->isAGPRClass(MRI->getRegClass(SrcReg));
if (!IsAGPRSrc)
break;
// def_instructions() does not look at subregs so it may give us a
// different instruction that defines the same vreg but different subreg
// so we have to manually check subreg.
Register SrcSubReg = I.getOperand(1).getSubReg();
for (auto &Def : MRI->def_instructions(SrcReg)) {
if (SrcSubReg != Def.getOperand(0).getSubReg())
continue;
if (Def.getOpcode() == AMDGPU::V_ACCVGPR_WRITE_B32_e64) {
MachineOperand DefSrcMO = Def.getOperand(1);
// Immediates are not an issue and can be propagated in
// postrapseudos pass. Only handle cases where defining
// accvgpr_write source is a vreg.
if (DefSrcMO.isReg() && DefSrcMO.getReg().isVirtual()) {
// Propagate source reg of accvgpr write to this copy instruction
I.getOperand(1).setReg(DefSrcMO.getReg());
I.getOperand(1).setSubReg(DefSrcMO.getSubReg());
// Reg uses were changed, collect unique set of registers to update
// live intervals at the end.
ModifiedRegs.insert(DefSrcMO.getReg());
ModifiedRegs.insert(SrcReg);
Changed = true;
}
// Found the defining accvgpr_write, stop looking any further.
break;
}
}
break;
}
case AMDGPU::S_MOV_B32:
if (I.getOperand(0).getReg() != Reg || !I.getOperand(1).isImm() ||
I.getNumOperands() != 2)
return false;
Def1 = &I;
Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
switch (I.getOperand(0).getSubReg()) {
default:
return false;
case AMDGPU::sub0:
if (Def0)
return false;
Def0 = &I;
Init |= I.getOperand(1).getImm() & 0xffffffff;
break;
case AMDGPU::sub1:
if (Def1)
return false;
Def1 = &I;
Init |= static_cast<uint64_t>(I.getOperand(1).getImm()) << 32;
break;
}
break;
}
}
// For AGPR reg, check if live intervals need to be updated.
if (IsAGPRDst) {
if (Changed) {
for (Register RegToUpdate : ModifiedRegs) {
LIS->removeInterval(RegToUpdate);
LIS->createAndComputeVirtRegInterval(RegToUpdate);
}
}
return Changed;
}
// For SGPR reg, check if we can combine instructions.
if (!Def0 || !Def1 || Def0->getParent() != Def1->getParent())
return false;
return Changed;
LLVM_DEBUG(dbgs() << "Combining:\n " << *Def0 << " " << *Def1
<< " =>\n");
@ -144,7 +219,7 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
TRI = ST.getRegisterInfo();
bool Changed = false;
@ -153,8 +228,10 @@ bool GCNPreRAOptimizations::runOnMachineFunction(MachineFunction &MF) {
if (!LIS->hasInterval(Reg))
continue;
const TargetRegisterClass *RC = MRI->getRegClass(Reg);
if (RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC))
if ((RC->MC->getSizeInBits() != 64 || !TRI->isSGPRClass(RC)) &&
(ST.hasGFX90AInsts() || !TRI->isAGPRClass(RC)))
continue;
Changed |= processReg(Reg);
}

View File

@ -0,0 +1,139 @@
# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
# RUN: llc -march=amdgcn -mcpu=gfx908 -run-pass=liveintervals,amdgpu-pre-ra-optimizations -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX908 %s
---
name: test_mfma_f32_4x4x1f32_propagate_vgpr
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_propagate_vgpr
; GFX908: liveins: $sgpr0_sgpr1
; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec
; GFX908: undef %4.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[V_MOV_B32_e32_1]], implicit $exec
; GFX908: %4.sub1:areg_128 = COPY [[V_MOV_B32_e32_1]]
; GFX908: %4.sub2:areg_128 = COPY [[V_MOV_B32_e32_1]]
; GFX908: %4.sub3:areg_128 = COPY [[V_MOV_B32_e32_1]]
; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
; GFX908: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_3]], [[V_MOV_B32_e32_2]], %4, 0, 0, 0, implicit $mode, implicit $exec
; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]]
; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; GFX908: S_ENDPGM 0
%1:sgpr_64(p4) = COPY $sgpr0_sgpr1
%4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
%5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
%13:vgpr_32 = V_MOV_B32_e32 1123418112, implicit $exec
undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %13:vgpr_32, implicit $exec
%11.sub1:areg_128 = COPY %11.sub0:areg_128
%11.sub2:areg_128 = COPY %11.sub0:areg_128
%11.sub3:areg_128 = COPY %11.sub0:areg_128
%8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
%9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
%10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec
%12:vreg_128 = COPY %10:areg_128
GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1)
S_ENDPGM 0
...
---
name: test_mfma_f32_4x4x1f32_no_propagate_imm
tracksRegLiveness: true
body: |
bb.0:
liveins: $sgpr0_sgpr1
; GFX908-LABEL: name: test_mfma_f32_4x4x1f32_no_propagate_imm
; GFX908: liveins: $sgpr0_sgpr1
; GFX908: [[COPY:%[0-9]+]]:sgpr_64(p4) = COPY $sgpr0_sgpr1
; GFX908: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
; GFX908: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
; GFX908: undef %3.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec
; GFX908: %3.sub1:areg_128 = COPY %3.sub0
; GFX908: %3.sub2:areg_128 = COPY %3.sub0
; GFX908: %3.sub3:areg_128 = COPY %3.sub0
; GFX908: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
; GFX908: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
; GFX908: [[V_MFMA_F32_4X4X1F32_e64_:%[0-9]+]]:areg_128 = V_MFMA_F32_4X4X1F32_e64 [[V_MOV_B32_e32_2]], [[V_MOV_B32_e32_1]], %3, 0, 0, 0, implicit $mode, implicit $exec
; GFX908: [[COPY1:%[0-9]+]]:vreg_128 = COPY [[V_MFMA_F32_4X4X1F32_e64_]]
; GFX908: GLOBAL_STORE_DWORDX4_SADDR [[V_MOV_B32_e32_]], [[COPY1]], [[S_LOAD_DWORDX2_IMM]], 0, 0, implicit $exec :: (store (s128), addrspace 1)
; GFX908: S_ENDPGM 0
%1:sgpr_64(p4) = COPY $sgpr0_sgpr1
%4:sreg_64_xexec = S_LOAD_DWORDX2_IMM %1:sgpr_64(p4), 36, 0 :: (dereferenceable invariant load (s64), align 4, addrspace 4)
%5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec
undef %11.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 1073741824, implicit $exec
%11.sub1:areg_128 = COPY %11.sub0:areg_128
%11.sub2:areg_128 = COPY %11.sub0:areg_128
%11.sub3:areg_128 = COPY %11.sub0:areg_128
%8:vgpr_32 = V_MOV_B32_e32 1073741824, implicit $exec
%9:vgpr_32 = V_MOV_B32_e32 1065353216, implicit $exec
%10:areg_128 = V_MFMA_F32_4X4X1F32_e64 %9:vgpr_32, %8:vgpr_32, %11:areg_128, 0, 0, 0, implicit $mode, implicit $exec
%12:vreg_128 = COPY %10:areg_128
GLOBAL_STORE_DWORDX4_SADDR %5:vgpr_32, %12:vreg_128, %4:sreg_64_xexec, 0, 0, implicit $exec :: (store (s128), addrspace 1)
S_ENDPGM 0
...
---
name: test_vgpr_subreg_propagate
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908-LABEL: name: test_vgpr_subreg_propagate
; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3
; GFX908: [[COPY:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
; GFX908: undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
; GFX908: %1.sub1:areg_128 = COPY [[COPY]].sub0
; GFX908: %1.sub2:areg_128 = COPY [[COPY]].sub0
; GFX908: %1.sub3:areg_128 = COPY [[COPY]].sub0
; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1
%0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec
undef %1.sub0:areg_128 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
%1.sub1:areg_128 = COPY %1.sub0:areg_128
%1.sub2:areg_128 = COPY %1.sub0:areg_128
%1.sub3:areg_128 = COPY %1.sub0:areg_128
S_ENDPGM 0, implicit %0, implicit %1
...
---
name: test_nonmatching_agpr_subreg_no_propagate
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX908-LABEL: name: test_nonmatching_agpr_subreg_no_propagate
; GFX908: liveins: $vgpr0_vgpr1
; GFX908: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
; GFX908: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
; GFX908: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
; GFX908: [[COPY1:%[0-9]+]]:areg_64 = COPY %1
; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]]
%0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
%1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec
%2:areg_64 = COPY %1:areg_64
S_ENDPGM 0, implicit %0, implicit %1, implicit %2
...
---
name: test_subreg_to_single_agpr_reg_propagate
tracksRegLiveness: true
body: |
bb.0:
liveins: $vgpr0_vgpr1
; GFX908-LABEL: name: test_subreg_to_single_agpr_reg_propagate
; GFX908: liveins: $vgpr0_vgpr1
; GFX908: [[COPY:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
; GFX908: undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub0, implicit $exec
; GFX908: %1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 [[COPY]].sub1, implicit $exec
; GFX908: [[COPY1:%[0-9]+]]:agpr_32 = COPY [[COPY]].sub1
; GFX908: S_ENDPGM 0, implicit [[COPY]], implicit %1, implicit [[COPY1]]
%0:vreg_64 = COPY $vgpr0_vgpr1, implicit $exec
undef %1.sub0:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub0, implicit $exec
%1.sub1:areg_64 = V_ACCVGPR_WRITE_B32_e64 %0.sub1, implicit $exec
%2:agpr_32 = COPY %1.sub1:areg_64
S_ENDPGM 0, implicit %0, implicit %1, implicit %2
...

View File

@ -578,19 +578,12 @@ bb:
ret void
}
; FIXME: Resulting code for splat is pretty bad. A v_mov_b32 is moved
; in the middle of the expanded agpr reg_sequence. The broadcast of
; the individual AGPR->AGPR components should avoid the intermediate AGPR case.
; GCN-LABEL: {{^}}test_mfma_f32_4x4x1f32_lit_splat_bad_code:
; GFX908_A: v_mov_b32_e32 [[TMP0:v[0-9]+]], 0x42f60000
; GCN: v_accvgpr_write_b32 [[AGPR:a[0-9]+]], [[TMP0]]
; GFX908: s_nop 0
; GFX908: v_accvgpr_read_b32 [[TMP1:v[0-9]+]], [[AGPR]]
; GFX908: v_accvgpr_read_b32 [[TMP2:v[0-9]+]], [[AGPR]]
; GFX908: v_accvgpr_read_b32 [[TMP3:v[0-9]+]], [[AGPR]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP1]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP2]]
; GFX908: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP3]]
; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
; GFX908-NEXT: v_accvgpr_write_b32 a{{[0-9]+}}, [[TMP0]]
; GFX90A-COUNT-3: v_accvgpr_mov_b32 a{{[0-9]+}}, [[AGPR]]
; GCN: s_nop 0
; GFX908_A: v_mfma_f32_4x4x1f32 a[{{[0-9]+:[0-9]+}}], {{v[0-9]+}}, {{v[0-9]+}}, a[{{[0-9]+:[0-9]+}}]