AMDGPU/GlobalISel: Use destination register bank in applyMappingLoad
Large loads on targets that do not useFlatForGlobal have to be split in regbankselect. This did not happen when the destination had the vgpr bank and the address had the sgpr bank. Instead of checking whether the address bank is sgpr, check the bank of the destination. Differential Revision: https://reviews.llvm.org/D101992
This commit is contained in:
parent
d13ce17bb4
commit
f6985a197e
|
|
@ -1144,9 +1144,9 @@ bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
|
|||
unsigned LoadSize = LoadTy.getSizeInBits();
|
||||
const unsigned MaxNonSmrdLoadSize = 128;
|
||||
|
||||
const RegisterBank *PtrBank =
|
||||
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
|
||||
if (PtrBank == &AMDGPU::SGPRRegBank) {
|
||||
const RegisterBank *DstBank =
|
||||
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
|
||||
if (DstBank == &AMDGPU::SGPRRegBank) {
|
||||
// There are some special cases that we need to look at for 32 bit and 96
|
||||
// bit SGPR loads otherwise we have nothing to do.
|
||||
if (LoadSize != 32 && LoadSize != 96)
|
||||
|
|
|
|||
|
|
@ -446,13 +446,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align1(<3 x i32> addrspace(4)*
|
|||
;
|
||||
; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align1:
|
||||
; GFX7-UNALIGNED: ; %bb.0:
|
||||
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
|
||||
; GFX7-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2
|
||||
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX7-UNALIGNED-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-UNALIGNED-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], off, s[0:3], 0
|
||||
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
|
||||
|
|
@ -564,13 +561,10 @@ define amdgpu_ps <3 x i32> @s_load_constant_v3i32_align2(<3 x i32> addrspace(4)*
|
|||
;
|
||||
; GFX7-UNALIGNED-LABEL: s_load_constant_v3i32_align2:
|
||||
; GFX7-UNALIGNED: ; %bb.0:
|
||||
; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
|
||||
; GFX7-UNALIGNED-NEXT: s_load_dword s0, s[0:1], 0x2
|
||||
; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0)
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s6
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s8
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, s0
|
||||
; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s7
|
||||
; GFX7-UNALIGNED-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-UNALIGNED-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-UNALIGNED-NEXT: buffer_load_dwordx3 v[0:2], off, s[0:3], 0
|
||||
; GFX7-UNALIGNED-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s0, v0
|
||||
; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s1, v1
|
||||
; GFX7-UNALIGNED-NEXT: v_readfirstlane_b32 s2, v2
|
||||
|
|
|
|||
|
|
@ -435,3 +435,46 @@ define void @store_lds_v3i32_align1(<3 x i32> addrspace(3)* %out, <3 x i32> %x)
|
|||
store <3 x i32> %x, <3 x i32> addrspace(3)* %out, align 1
|
||||
ret void
|
||||
}
|
||||
|
||||
define amdgpu_ps void @test_s_load_constant_v8i32_align1(<8 x i32> addrspace(4)* inreg %ptr, <8 x i32> addrspace(1)* inreg %out) {
|
||||
; GFX9-LABEL: test_s_load_constant_v8i32_align1:
|
||||
; GFX9: ; %bb.0:
|
||||
; GFX9-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX9-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
|
||||
; GFX9-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
|
||||
; GFX9-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX9-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
|
||||
; GFX9-NEXT: s_endpgm
|
||||
;
|
||||
; GFX7-LABEL: test_s_load_constant_v8i32_align1:
|
||||
; GFX7: ; %bb.0:
|
||||
; GFX7-NEXT: s_mov_b32 s4, s2
|
||||
; GFX7-NEXT: s_mov_b32 s5, s3
|
||||
; GFX7-NEXT: s_mov_b32 s2, -1
|
||||
; GFX7-NEXT: s_mov_b32 s3, 0xf000
|
||||
; GFX7-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0
|
||||
; GFX7-NEXT: buffer_load_dwordx4 v[4:7], off, s[0:3], 0 offset:16
|
||||
; GFX7-NEXT: s_mov_b64 s[6:7], s[2:3]
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
|
||||
; GFX7-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX7-NEXT: buffer_store_dwordx4 v[4:7], off, s[4:7], 0 offset:16
|
||||
; GFX7-NEXT: s_endpgm
|
||||
;
|
||||
; GFX10-LABEL: test_s_load_constant_v8i32_align1:
|
||||
; GFX10: ; %bb.0:
|
||||
; GFX10-NEXT: v_mov_b32_e32 v8, 0
|
||||
; GFX10-NEXT: s_clause 0x1
|
||||
; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1]
|
||||
; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(1)
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[2:3]
|
||||
; GFX10-NEXT: s_waitcnt vmcnt(0)
|
||||
; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[2:3] offset:16
|
||||
; GFX10-NEXT: s_endpgm
|
||||
%load = load <8 x i32>, <8 x i32> addrspace(4)* %ptr, align 1
|
||||
store <8 x i32> %load, <8 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
|
|
|||
|
|
@ -14,7 +14,17 @@ body: |
|
|||
; GFX7: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
|
||||
; GFX7: %in_addr:sgpr(p1) = COPY $sgpr0_sgpr1
|
||||
; GFX7: %out_addr:sgpr(p1) = COPY $sgpr2_sgpr3
|
||||
; GFX7: %load:vgpr(<16 x s32>) = G_LOAD %in_addr(p1) :: (load 64, align 4, addrspace 1)
|
||||
; GFX7: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %in_addr(p1) :: (load 16, align 4, addrspace 1)
|
||||
; GFX7: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
|
||||
; GFX7: [[PTR_ADD:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C]](s64)
|
||||
; GFX7: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p1) :: (load 16 from unknown-address + 16, align 4, addrspace 1)
|
||||
; GFX7: [[C1:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 32
|
||||
; GFX7: [[PTR_ADD1:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C1]](s64)
|
||||
; GFX7: [[LOAD2:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD1]](p1) :: (load 16 from unknown-address + 32, align 4, addrspace 1)
|
||||
; GFX7: [[C2:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 48
|
||||
; GFX7: [[PTR_ADD2:%[0-9]+]]:vgpr(p1) = G_PTR_ADD %in_addr, [[C2]](s64)
|
||||
; GFX7: [[LOAD3:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD2]](p1) :: (load 16 from unknown-address + 48, align 4, addrspace 1)
|
||||
; GFX7: %load:vgpr(<16 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>), [[LOAD2]](<4 x s32>), [[LOAD3]](<4 x s32>)
|
||||
; GFX7: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>), %load8_11:vgpr(<4 x s32>), %load12_15:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<16 x s32>)
|
||||
; GFX7: G_STORE %load0_3(<4 x s32>), %out_addr(p1) :: (store 16, align 4, addrspace 1)
|
||||
; GFX7: %cst16:sgpr(s64) = G_CONSTANT i64 16
|
||||
|
|
@ -88,7 +98,11 @@ body: |
|
|||
; GFX7: liveins: $sgpr0_sgpr1, $sgpr2_sgpr3
|
||||
; GFX7: %ptr:sgpr(p4) = COPY $sgpr0_sgpr1
|
||||
; GFX7: %out:sgpr(p1) = COPY $sgpr2_sgpr3
|
||||
; GFX7: %load:vgpr(<8 x s32>) = G_LOAD %ptr(p4) :: (load 32, align 1, addrspace 4)
|
||||
; GFX7: [[LOAD:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD %ptr(p4) :: (load 16, align 1, addrspace 4)
|
||||
; GFX7: [[C:%[0-9]+]]:vgpr(s64) = G_CONSTANT i64 16
|
||||
; GFX7: [[PTR_ADD:%[0-9]+]]:vgpr(p4) = G_PTR_ADD %ptr, [[C]](s64)
|
||||
; GFX7: [[LOAD1:%[0-9]+]]:vgpr(<4 x s32>) = G_LOAD [[PTR_ADD]](p4) :: (load 16 from unknown-address + 16, align 1, addrspace 4)
|
||||
; GFX7: %load:vgpr(<8 x s32>) = G_CONCAT_VECTORS [[LOAD]](<4 x s32>), [[LOAD1]](<4 x s32>)
|
||||
; GFX7: %load0_3:vgpr(<4 x s32>), %load4_7:vgpr(<4 x s32>) = G_UNMERGE_VALUES %load(<8 x s32>)
|
||||
; GFX7: G_STORE %load0_3(<4 x s32>), %out(p1) :: (store 16, align 32, addrspace 1)
|
||||
; GFX7: %cst_16:sgpr(s64) = G_CONSTANT i64 16
|
||||
|
|
|
|||
Loading…
Reference in New Issue