[NFC] Refactor tests to improve readability.
parent 40b230f685
commit 4f28a2eb03
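The refactor collapses the per-type kernels, which loaded an operand, applied the operation, and stored the result through global pointers, into one kernel per operation group that applies the costed instructions directly to undef values, so each check line pairs with a single instruction. Below is a condensed, abridged sketch of that before/after pattern, assembled from the add_i32 hunks that follow (shared RUN lines and the attributes #0 definition are not shown in this excerpt, and only two of the grouped instructions are reproduced):

; Before: one kernel per type, values routed through global memory.
; ALL: 'add_i32'
; ALL: estimated cost of 1 for {{.*}} add i32
define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
  %vec = load i32, i32 addrspace(1)* %vaddr
  %add = add i32 %vec, %b
  store i32 %add, i32 addrspace(1)* %out
  ret void
}

; After: one kernel covers the whole group using undef operands (abridged).
; ALL-LABEL: 'add_i32'
; ALL: estimated cost of 1 for {{.*}} add i32
define amdgpu_kernel void @add_i32() #0 {
  %i32 = add i32 undef, undef
  %v2i32 = add <2 x i32> undef, undef
  ret void
}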
@@ -2,154 +2,63 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FAST16,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOW16,ALL %s
|
||||
; END.
|
||||
|
||||
|
||||
; ALL: 'add_i32'
|
||||
; ALL-LABEL: 'add_i32'
|
||||
; ALL: estimated cost of 1 for {{.*}} add i32
|
||||
define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%add = add i32 %vec, %b
|
||||
store i32 %add, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v2i32'
|
||||
; ALL: estimated cost of 2 for {{.*}} add <2 x i32>
|
||||
define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
|
||||
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
|
||||
%add = add <2 x i32> %vec, %b
|
||||
store <2 x i32> %add, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v3i32'
|
||||
; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
;;; Allow for 4 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
;;; and 3 when it is legal.
|
||||
; ALL: estimated cost of {{[34]}} for {{.*}} add <3 x i32>
|
||||
define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%add = add <3 x i32> %vec, %b
|
||||
store <3 x i32> %add, <3 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v4i32'
|
||||
; ALL: estimated cost of 4 for {{.*}} add <4 x i32>
|
||||
define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
|
||||
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
|
||||
%add = add <4 x i32> %vec, %b
|
||||
store <4 x i32> %add, <4 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v5i32'
|
||||
; Allow for 8 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
;;; Allow for 8 when v3i32 is illegal and TargetLowering thinks it needs widening,
|
||||
;;; and 5 when it is legal.
|
||||
; ALL: estimated cost of {{[58]}} for {{.*}} add <5 x i32>
|
||||
define amdgpu_kernel void @add_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%add = add <5 x i32> %vec, %b
|
||||
store <5 x i32> %add, <5 x i32> addrspace(1)* %out
|
||||
define amdgpu_kernel void @add_i32() #0 {
|
||||
%i32 = add i32 undef, undef
|
||||
%v2i32 = add <2 x i32> undef, undef
|
||||
%v3i32 = add <3 x i32> undef, undef
|
||||
%v4i32 = add <4 x i32> undef, undef
|
||||
%v5i32 = add <5 x i32> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_i64'
|
||||
; ALL-LABEL: 'add_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} add i64
|
||||
define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%add = add i64 %vec, %b
|
||||
store i64 %add, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v2i64'
|
||||
; ALL: estimated cost of 4 for {{.*}} add <2 x i64>
|
||||
define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
|
||||
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
|
||||
%add = add <2 x i64> %vec, %b
|
||||
store <2 x i64> %add, <2 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v3i64'
|
||||
; ALL: estimated cost of 6 for {{.*}} add <3 x i64>
|
||||
define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
|
||||
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
|
||||
%add = add <3 x i64> %vec, %b
|
||||
store <3 x i64> %add, <3 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v4i64'
|
||||
; ALL: estimated cost of 8 for {{.*}} add <4 x i64>
|
||||
define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
|
||||
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
|
||||
%add = add <4 x i64> %vec, %b
|
||||
store <4 x i64> %add, <4 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v16i64'
|
||||
; ALL: estimated cost of 128 for {{.*}} add <16 x i64>
|
||||
define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
|
||||
%vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
|
||||
%add = add <16 x i64> %vec, %b
|
||||
store <16 x i64> %add, <16 x i64> addrspace(1)* %out
|
||||
define amdgpu_kernel void @add_i64() #0 {
|
||||
%i64 = add i64 undef, undef
|
||||
%v2i64 = add <2 x i64> undef, undef
|
||||
%v3i64 = add <3 x i64> undef, undef
|
||||
%v4i64 = add <4 x i64> undef, undef
|
||||
%v16i64 = add <16 x i64> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_i16'
|
||||
; ALL-LABEL: 'add_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} add i16
|
||||
define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%add = add i16 %vec, %b
|
||||
store i16 %add, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'add_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} add <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} add <2 x i16>
|
||||
define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%add = add <2 x i16> %vec, %b
|
||||
store <2 x i16> %add, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @add_i16() #0 {
|
||||
%i16 = add i16 undef, undef
|
||||
%v2i16 = add <2 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'sub_i32'
|
||||
; ALL-LABEL: 'sub'
|
||||
; ALL: estimated cost of 1 for {{.*}} sub i32
|
||||
define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%sub = sub i32 %vec, %b
|
||||
store i32 %sub, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'sub_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} sub i64
|
||||
define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%sub = sub i64 %vec, %b
|
||||
store i64 %sub, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
; ALL: 'sub_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} sub i16
|
||||
define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%sub = sub i16 %vec, %b
|
||||
store i16 %sub, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'sub_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} sub <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} sub <2 x i16>
|
||||
define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%sub = sub <2 x i16> %vec, %b
|
||||
store <2 x i16> %sub, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @sub() #0 {
|
||||
%i32 = sub i32 undef, undef
|
||||
%i64 = sub i64 undef, undef
|
||||
%i16 = sub i16 undef, undef
|
||||
%v2i16 = sub <2 x i16> undef, undef
|
||||
ret void
|
||||
}

@@ -1,5 +1,6 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri < %s | FileCheck %s
; END.

; CHECK-LABEL: 'addrspacecast_global_to_flat'
; CHECK: estimated cost of 0 for {{.*}} addrspacecast i8 addrspace(1)* %ptr to i8*

@@ -2,88 +2,41 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=ALL,SLOW16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,FAST16 %s
|
||||
; END.
|
||||
|
||||
; ALL: 'or_i32'
|
||||
; ALL-LABEL: 'or'
|
||||
; ALL: estimated cost of 1 for {{.*}} or i32
|
||||
define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = or i32 %vec, %b
|
||||
store i32 %or, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'or_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} or i64
|
||||
define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = or i64 %vec, %b
|
||||
store i64 %or, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'or_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} or <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} or <2 x i16>
|
||||
define amdgpu_kernel void @or_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%or = or <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @or() #0 {
|
||||
%i32 = or i32 undef, undef
|
||||
%i64 = or i64 undef, undef
|
||||
%v2i16 = or <2 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'xor_i32'
|
||||
; ALL-LABEL: 'xor'
|
||||
; ALL: estimated cost of 1 for {{.*}} xor i32
|
||||
define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = xor i32 %vec, %b
|
||||
store i32 %or, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'xor_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} xor i64
|
||||
define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = xor i64 %vec, %b
|
||||
store i64 %or, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'xor_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} xor <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} xor <2 x i16>
|
||||
define amdgpu_kernel void @xor_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%xor = xor <2 x i16> %vec, %b
|
||||
store <2 x i16> %xor, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @xor() #0 {
|
||||
%i32 = xor i32 undef, undef
|
||||
%i64 = xor i64 undef, undef
|
||||
%v2i16 = xor <2 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'and_i32'
|
||||
; ALL-LABEL: 'and'
|
||||
; ALL: estimated cost of 1 for {{.*}} and i32
|
||||
define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = and i32 %vec, %b
|
||||
store i32 %or, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'and_i64'
|
||||
; ALL: estimated cost of 2 for {{.*}} and i64
|
||||
define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = and i64 %vec, %b
|
||||
store i64 %or, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL: 'and_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} and <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} and <2 x i16>
|
||||
define amdgpu_kernel void @and_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%and = and <2 x i16> %vec, %b
|
||||
store <2 x i16> %and, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @and() #0 {
|
||||
%i32 = and i32 undef, undef
|
||||
%i64 = and i64 undef, undef
|
||||
%v2i16 = and <2 x i16> undef, undef
|
||||
ret void
|
||||
}

@@ -1,15 +1,16 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SPEED %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck --check-prefixes=ALL,SIZE %s
; END.

; ALL-LABEL: 'test_br_cost'
; SPEED: estimated cost of 7 for instruction: br i1
; SPEED-NEXT: estimated cost of 7 for instruction: br i1
; SPEED: estimated cost of 4 for instruction: br label
; SPEED: estimated cost of 1 for instruction: %phi = phi i32 [
; SPEED: estimated cost of 10 for instruction: ret void
; SIZE: estimated cost of 5 for instruction: br i1
; SPEED-NEXT: estimated cost of 1 for instruction: %phi = phi i32 [
; SPEED-NEXT: estimated cost of 10 for instruction: ret void
; SIZE-NEXT: estimated cost of 5 for instruction: br i1
; SIZE: estimated cost of 1 for instruction: br label
; SIZE: estimated cost of 0 for instruction: %phi = phi i32 [
; SIZE: estimated cost of 1 for instruction: ret void
; SIZE-NEXT: estimated cost of 0 for instruction: %phi = phi i32 [
; SIZE-NEXT: estimated cost of 1 for instruction: ret void
define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
bb0:
br i1 undef, label %bb1, label %bb2
@@ -26,8 +27,8 @@ bb2:
}

; ALL-LABEL: 'test_switch_cost'
; SPEED: estimated cost of 24 for instruction: switch
; SIZE: estimated cost of 18 for instruction: switch
; SPEED-NEXT: estimated cost of 24 for instruction: switch
; SIZE-NEXT: estimated cost of 18 for instruction: switch
define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
entry:
switch i32 %a, label %default [

@@ -4,141 +4,55 @@
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,GFX89 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX89 %s
|
||||
; END.
|
||||
|
||||
|
||||
; GCN: 'extractelement_v2i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
|
||||
define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <2 x i32> %vec, i32 1
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
; GCN-LABEL: 'extractelement_32'
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i32>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x float>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <3 x i32>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <4 x i32>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <5 x i32>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <8 x i32>
|
||||
; GCN-NEXT: estimated cost of 2 for {{.*}} extractelement <8 x i32>
|
||||
define amdgpu_kernel void @extractelement_32(i32 %arg) {
|
||||
%v2i32_1 = extractelement <2 x i32> undef, i32 1
|
||||
%v2f32_1 = extractelement <2 x float> undef, i32 1
|
||||
%v3i32_1 = extractelement <3 x i32> undef, i32 1
|
||||
%v4i32_1 = extractelement <4 x i32> undef, i32 1
|
||||
%v5i32_1 = extractelement <5 x i32> undef, i32 1
|
||||
%v8i32_1 = extractelement <8 x i32> undef, i32 1
|
||||
%v8i32_a = extractelement <8 x i32> undef, i32 %arg
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v2f32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
|
||||
define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%elt = extractelement <2 x float> %vec, i32 1
|
||||
store float %elt, float addrspace(1)* %out
|
||||
; GCN-LABEL: 'extractelement_64'
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i64>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <3 x i64>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <4 x i64>
|
||||
; GCN-NEXT: estimated cost of 0 for {{.*}} extractelement <8 x i64>
|
||||
define amdgpu_kernel void @extractelement_64() {
|
||||
%v2i64_1 = extractelement <2 x i64> undef, i64 1
|
||||
%v3i64_1 = extractelement <3 x i64> undef, i64 1
|
||||
%v4i64_1 = extractelement <4 x i64> undef, i64 1
|
||||
%v8i64_1 = extractelement <8 x i64> undef, i64 1
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v3i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
|
||||
define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <3 x i32> %vec, i32 1
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
; GCN-LABEL: 'extractelement_8'
|
||||
; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <4 x i8>
|
||||
define amdgpu_kernel void @extractelement_8() {
|
||||
%v4i8_1 = extractelement <4 x i8> undef, i8 1
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v4i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32>
|
||||
define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
|
||||
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <4 x i32> %vec, i32 1
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v5i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <5 x i32>
|
||||
define amdgpu_kernel void @extractelement_v5i32(i32 addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr) {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <5 x i32> %vec, i32 1
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v8i32'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32>
|
||||
define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
|
||||
%vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <8 x i32> %vec, i32 1
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; FIXME: Should be non-0
|
||||
; GCN: 'extractelement_v8i32_dynindex'
|
||||
; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32>
|
||||
define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
|
||||
%vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
|
||||
%elt = extractelement <8 x i32> %vec, i32 %idx
|
||||
store i32 %elt, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v2i64'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64>
|
||||
define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
|
||||
%elt = extractelement <2 x i64> %vec, i64 1
|
||||
store i64 %elt, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v3i64'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64>
|
||||
define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
|
||||
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
|
||||
%elt = extractelement <3 x i64> %vec, i64 1
|
||||
store i64 %elt, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v4i64'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64>
|
||||
define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
|
||||
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
|
||||
%elt = extractelement <4 x i64> %vec, i64 1
|
||||
store i64 %elt, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v8i64'
|
||||
; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64>
|
||||
define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
|
||||
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
|
||||
%elt = extractelement <8 x i64> %vec, i64 1
|
||||
store i64 %elt, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_v4i8'
|
||||
; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8>
|
||||
define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
|
||||
%vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
|
||||
%elt = extractelement <4 x i8> %vec, i8 1
|
||||
store i8 %elt, i8 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_0_v2i16':
|
||||
; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0
|
||||
; GFX89: estimated cost of 0 for {{.*}} extractelement <2 x i16>
|
||||
define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%elt = extractelement <2 x i16> %vec, i16 0
|
||||
store i16 %elt, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_1_v2i16':
|
||||
; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
|
||||
define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%elt = extractelement <2 x i16> %vec, i16 1
|
||||
store i16 %elt, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; GCN: 'extractelement_var_v2i16'
|
||||
; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16>
|
||||
define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%elt = extractelement <2 x i16> %vec, i32 %idx
|
||||
store i16 %elt, i16 addrspace(1)* %out
|
||||
; GCN-LABEL: 'extractelement_16'
|
||||
; CI-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16> undef, i16 0
|
||||
; GFX89-NEXT: estimated cost of 0 for {{.*}} extractelement <2 x i16>
|
||||
; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16>
|
||||
; GCN-NEXT: estimated cost of 1 for {{.*}} extractelement <2 x i16>
|
||||
define amdgpu_kernel void @extractelement_16(i32 %arg) {
|
||||
%v2i16_0 = extractelement <2 x i16> undef, i16 0
|
||||
%v2i16_1 = extractelement <2 x i16> undef, i16 1
|
||||
%v2i16_a = extractelement <2 x i16> undef, i32 %arg
|
||||
ret void
|
||||
}

@@ -1,93 +1,39 @@
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; END.
|
||||
|
||||
; CHECK-LABEL: 'fabs_f32'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
|
||||
define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%fabs = call float @llvm.fabs.f32(float %vec) #1
|
||||
store float %fabs, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v2f32'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
|
||||
define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
|
||||
store <2 x float> %fabs, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v3f32'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
|
||||
define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
|
||||
store <3 x float> %fabs, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v5f32'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <5 x float> @llvm.fabs.v5f32
|
||||
define amdgpu_kernel void @fabs_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%fabs = call <5 x float> @llvm.fabs.v5f32(<5 x float> %vec) #1
|
||||
store <5 x float> %fabs, <5 x float> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fabs_f32() #0 {
|
||||
%f32 = call float @llvm.fabs.f32(float undef) #1
|
||||
%v2f32 = call <2 x float> @llvm.fabs.v2f32(<2 x float> undef) #1
|
||||
%v3f32 = call <3 x float> @llvm.fabs.v3f32(<3 x float> undef) #1
|
||||
%v5f32 = call <5 x float> @llvm.fabs.v5f32(<5 x float> undef) #1
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_f64'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
|
||||
define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
|
||||
%vec = load double, double addrspace(1)* %vaddr
|
||||
%fabs = call double @llvm.fabs.f64(double %vec) #1
|
||||
store double %fabs, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v2f64'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
|
||||
define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
|
||||
%fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
|
||||
store <2 x double> %fabs, <2 x double> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v3f64'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
|
||||
define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
|
||||
%fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
|
||||
store <3 x double> %fabs, <3 x double> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fabs_f64() #0 {
|
||||
%f64 = call double @llvm.fabs.f64(double undef) #1
|
||||
%v2f64 = call <2 x double> @llvm.fabs.v2f64(<2 x double> undef) #1
|
||||
%v3f64 = call <3 x double> @llvm.fabs.v3f64(<3 x double> undef) #1
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_f16'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
|
||||
define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%fabs = call half @llvm.fabs.f16(half %vec) #1
|
||||
store half %fabs, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v2f16'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
|
||||
define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
|
||||
store <2 x half> %fabs, <2 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fabs_v3f16'
|
||||
; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
|
||||
define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
|
||||
store <3 x half> %fabs, <3 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fabs_f16() #0 {
|
||||
%f16 = call half @llvm.fabs.f16(half undef) #1
|
||||
%v2f16 = call <2 x half> @llvm.fabs.v2f16(<2 x half> undef) #1
|
||||
%v3f16 = call <3 x half> @llvm.fabs.v3f16(<3 x half> undef) #1
|
||||
ret void
|
||||
}

@@ -3,47 +3,25 @@
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=FASTF16,SIZEALL,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF16,SIZEALL,ALL %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
|
||||
; END.
|
||||
|
||||
; ALL-LABEL: 'fadd_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd float
|
||||
define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%add = fadd float %vec, %b
|
||||
store float %add, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v2f32'
|
||||
; NOPACKEDF32: estimated cost of 2 for {{.*}} fadd <2 x float>
|
||||
; PACKEDF32: estimated cost of 1 for {{.*}} fadd <2 x float>
|
||||
define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%add = fadd <2 x float> %vec, %b
|
||||
store <2 x float> %add, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v3f32'
|
||||
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 3 when it is legal.
|
||||
; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fadd <3 x float>
|
||||
; PACKEDF32: estimated cost of 2 for {{.*}} fadd <3 x float>
|
||||
define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fadd <3 x float> %vec, %b
|
||||
store <3 x float> %add, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v5f32'
|
||||
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
|
||||
; and 5 when it is legal.
|
||||
; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fadd <5 x float>
|
||||
; PACKEDF32: estimated cost of 3 for {{.*}} fadd <5 x float>
|
||||
define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fadd <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fadd_f32() #0 {
|
||||
%f32 = fadd float undef, undef
|
||||
%v2f32 = fadd <2 x float> undef, undef
|
||||
%v3f32 = fadd <3 x float> undef, undef
|
||||
%v5f32 = fadd <5 x float> undef, undef
|
||||
ret void
|
||||
}
@@ -52,73 +30,34 @@ define amdgpu_kernel void @fadd_v5f32(<5 x float> addrspace(1)* %out, <5 x float
; FASTF64: estimated cost of 2 for {{.*}} fadd double
|
||||
; SLOWF64: estimated cost of 4 for {{.*}} fadd double
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fadd double
|
||||
define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
|
||||
%vec = load double, double addrspace(1)* %vaddr
|
||||
%add = fadd double %vec, %b
|
||||
store double %add, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v2f64'
|
||||
; GFX90A-FASTF64: estimated cost of 2 for {{.*}} fadd <2 x double>
|
||||
; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
|
||||
; SLOWF64: estimated cost of 8 for {{.*}} fadd <2 x double>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
|
||||
define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
|
||||
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
|
||||
%add = fadd <2 x double> %vec, %b
|
||||
store <2 x double> %add, <2 x double> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v3f64'
|
||||
; GFX90A-FASTF64: estimated cost of 3 for {{.*}} fadd <3 x double>
|
||||
; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
|
||||
; SLOWF64: estimated cost of 12 for {{.*}} fadd <3 x double>
|
||||
; SIZEALL: estimated cost of 6 for {{.*}} fadd <3 x double>
|
||||
define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
|
||||
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
|
||||
%add = fadd <3 x double> %vec, %b
|
||||
store <3 x double> %add, <3 x double> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fadd_f64() #0 {
|
||||
%f64 = fadd double undef, undef
|
||||
%v2f64 = fadd <2 x double> undef, undef
|
||||
%v3f64 = fadd <3 x double> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_f16'
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd half
|
||||
define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fadd half %vec, %b
|
||||
store half %add, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v2f16'
|
||||
; SLOWF16: estimated cost of 2 for {{.*}} fadd <2 x half>
|
||||
; FASTF16: estimated cost of 1 for {{.*}} fadd <2 x half>
|
||||
define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fadd <2 x half> %vec, %b
|
||||
store <2 x half> %add, <2 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v3f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fadd <3 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fadd <3 x half>
|
||||
define amdgpu_kernel void @fadd_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%add = fadd <3 x half> %vec, %b
|
||||
store <3 x half> %add, <3 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fadd_v4f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fadd <4 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fadd <4 x half>
|
||||
define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fadd <4 x half> %vec, %b
|
||||
store <4 x half> %add, <4 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fadd_f16() #0 {
|
||||
%f16 = fadd half undef, undef
|
||||
%v2f16 = fadd <2 x half> undef, undef
|
||||
%v3f16 = fadd <3 x half> undef, undef
|
||||
%v4f16 = fadd <4 x half> undef, undef
|
||||
ret void
|
||||
}

@@ -9,84 +9,39 @@
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-mesa-mesa3d -mcpu=verde < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZESI,SIZENOF16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZECI,SIZEF16 %s
|
||||
; END.
|
||||
|
||||
; ALL-LABEL: 'fdiv_f32_ieee'
|
||||
; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
|
||||
; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
|
||||
; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
|
||||
; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
|
||||
; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
|
||||
define amdgpu_kernel void @fdiv_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%add = fdiv float %vec, %b
|
||||
store float %add, float addrspace(1)* %out
|
||||
; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
|
||||
; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float>
|
||||
; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float>
|
||||
define amdgpu_kernel void @fdiv_f32_ieee() #0 {
|
||||
%f32 = fdiv float undef, undef
|
||||
%v2f32 = fdiv <2 x float> undef, undef
|
||||
%v3f32 = fdiv <3 x float> undef, undef
|
||||
%v5f32 = fdiv <5 x float> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 16 for {{.*}} fdiv float
|
||||
; SIZEALL: estimated cost of 14 for {{.*}} fdiv float
|
||||
define amdgpu_kernel void @fdiv_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #1 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%add = fdiv float %vec, %b
|
||||
store float %add, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v2f32_ieee'
|
||||
; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
|
||||
; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
|
||||
define amdgpu_kernel void @fdiv_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x float> %vec, %b
|
||||
store <2 x float> %add, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v2f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 32 for {{.*}} fdiv <2 x float>
|
||||
; SIZEALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
|
||||
define amdgpu_kernel void @fdiv_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #1 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x float> %vec, %b
|
||||
store <2 x float> %add, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v3f32_ieee'
|
||||
; THRPTALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
|
||||
; SIZEALL: estimated cost of 36 for {{.*}} fdiv <3 x float>
|
||||
define amdgpu_kernel void @fdiv_v3f32_ieee(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <3 x float> %vec, %b
|
||||
store <3 x float> %add, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v3f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 48 for {{.*}} fdiv <3 x float>
|
||||
; SIZEALL: estimated cost of 42 for {{.*}} fdiv <3 x float>
|
||||
define amdgpu_kernel void @fdiv_v3f32_ftzdaz(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #1 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <3 x float> %vec, %b
|
||||
store <3 x float> %add, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v5f32_ieee'
|
||||
; THRPTALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
|
||||
; SIZEALL: estimated cost of 60 for {{.*}} fdiv <5 x float>
|
||||
define amdgpu_kernel void @fdiv_v5f32_ieee(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v5f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 80 for {{.*}} fdiv <5 x float>
|
||||
; SIZEALL: estimated cost of 70 for {{.*}} fdiv <5 x float>
|
||||
define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #1 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fdiv_f32_ftzdaz() #1 {
|
||||
%f32 = fdiv float undef, undef
|
||||
%v2f32 = fdiv <2 x float> undef, undef
|
||||
%v3f32 = fdiv <3 x float> undef, undef
|
||||
%v5f32 = fdiv <5 x float> undef, undef
|
||||
ret void
|
||||
}
@@ -97,208 +52,107 @@ define amdgpu_kernel void @fdiv_v5f32_ftzdaz(<5 x float> addrspace(1)* %out, <5
; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
|
||||
; SIZECI: estimated cost of 22 for {{.*}} fdiv double
|
||||
; SIZESI: estimated cost of 25 for {{.*}} fdiv double
|
||||
define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
|
||||
%vec = load double, double addrspace(1)* %vaddr
|
||||
%add = fdiv double %vec, %b
|
||||
store double %add, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v2f64'
|
||||
; CIFASTF64: estimated cost of 48 for {{.*}} fdiv <2 x double>
|
||||
; CISLOWF64: estimated cost of 76 for {{.*}} fdiv <2 x double>
|
||||
; SIFASTF64: estimated cost of 54 for {{.*}} fdiv <2 x double>
|
||||
; SISLOWF64: estimated cost of 82 for {{.*}} fdiv <2 x double>
|
||||
; SIZECI: estimated cost of 44 for {{.*}} fdiv <2 x double>
|
||||
; SIZESI: estimated cost of 50 for {{.*}} fdiv <2 x double>
|
||||
define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
|
||||
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x double> %vec, %b
|
||||
store <2 x double> %add, <2 x double> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v3f64'
|
||||
; CIFASTF64: estimated cost of 72 for {{.*}} fdiv <3 x double>
|
||||
; CISLOWF64: estimated cost of 114 for {{.*}} fdiv <3 x double>
|
||||
; SIFASTF64: estimated cost of 81 for {{.*}} fdiv <3 x double>
|
||||
; SISLOWF64: estimated cost of 123 for {{.*}} fdiv <3 x double>
|
||||
; SIZECI: estimated cost of 66 for {{.*}} fdiv <3 x double>
|
||||
; SIZESI: estimated cost of 75 for {{.*}} fdiv <3 x double>
|
||||
define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
|
||||
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
|
||||
%add = fdiv <3 x double> %vec, %b
|
||||
store <3 x double> %add, <3 x double> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fdiv_f64() #0 {
|
||||
%f64 = fdiv double undef, undef
|
||||
%v2f64 = fdiv <2 x double> undef, undef
|
||||
%v3f64 = fdiv <3 x double> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_f16_f32_ieee'
|
||||
; ALL-LABEL: 'fdiv_f16_f32ieee'
|
||||
; NOFP16: estimated cost of 14 for {{.*}} fdiv half
|
||||
; FP16: estimated cost of 12 for {{.*}} fdiv half
|
||||
; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
|
||||
; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
|
||||
define amdgpu_kernel void @fdiv_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fdiv half %vec, %b
|
||||
store half %add, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_f16_f32_ftzdaz'
|
||||
; NOFP16: estimated cost of 16 for {{.*}} fdiv half
|
||||
; FP16: estimated cost of 12 for {{.*}} fdiv half
|
||||
; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half
|
||||
; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
|
||||
define amdgpu_kernel void @fdiv_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #1 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fdiv half %vec, %b
|
||||
store half %add, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v2f16_f32_ieee'
|
||||
; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
|
||||
; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
|
||||
; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
|
||||
; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
|
||||
define amdgpu_kernel void @fdiv_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x half> %vec, %b
|
||||
store <2 x half> %add, <2 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v2f16_f32_ftzdaz'
|
||||
; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half>
|
||||
; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
|
||||
; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half>
|
||||
; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
|
||||
define amdgpu_kernel void @fdiv_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #1 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x half> %vec, %b
|
||||
store <2 x half> %add, <2 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v4f16_f32_ieee'
|
||||
; NOFP16: estimated cost of 56 for {{.*}} fdiv <4 x half>
|
||||
; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
|
||||
; SIZENOF16: estimated cost of 48 for {{.*}} fdiv <4 x half>
|
||||
; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
|
||||
define amdgpu_kernel void @fdiv_v4f16_f32_ieee(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fdiv <4 x half> %vec, %b
|
||||
store <4 x half> %add, <4 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fdiv_f16_f32ieee() #0 {
|
||||
%f16 = fdiv half undef, undef
|
||||
%v2f16 = fdiv <2 x half> undef, undef
|
||||
%v4f16 = fdiv <4 x half> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fdiv_v4f16_f32_ftzdaz'
|
||||
; ALL-LABEL: 'fdiv_f16_f32ftzdaz'
|
||||
; NOFP16: estimated cost of 16 for {{.*}} fdiv half
|
||||
; FP16: estimated cost of 12 for {{.*}} fdiv half
|
||||
; SIZENOF16: estimated cost of 14 for {{.*}} fdiv half
|
||||
; SIZEF16: estimated cost of 8 for {{.*}} fdiv half
|
||||
; NOFP16: estimated cost of 32 for {{.*}} fdiv <2 x half>
|
||||
; FP16: estimated cost of 24 for {{.*}} fdiv <2 x half>
|
||||
; SIZENOF16: estimated cost of 28 for {{.*}} fdiv <2 x half>
|
||||
; SIZEF16: estimated cost of 16 for {{.*}} fdiv <2 x half>
|
||||
; NOFP16: estimated cost of 64 for {{.*}} fdiv <4 x half>
|
||||
; FP16: estimated cost of 48 for {{.*}} fdiv <4 x half>
|
||||
; SIZENOF16: estimated cost of 56 for {{.*}} fdiv <4 x half>
|
||||
; SIZEF16: estimated cost of 32 for {{.*}} fdiv <4 x half>
|
||||
define amdgpu_kernel void @fdiv_v4f16_f32_ftzdaz(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #1 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fdiv <4 x half> %vec, %b
|
||||
store <4 x half> %add, <4 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fdiv_f16_f32ftzdaz() #1 {
|
||||
%f16 = fdiv half undef, undef
|
||||
%v2f16 = fdiv <2 x half> undef, undef
|
||||
%v4f16 = fdiv <4 x half> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_f32_ieee'
|
||||
; ALL-LABEL: 'rcp_ieee'
|
||||
; THRPTALL: estimated cost of 14 for {{.*}} fdiv float
|
||||
; SIZEALL: estimated cost of 12 for {{.*}} fdiv float
|
||||
define amdgpu_kernel void @rcp_f32_ieee(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%add = fdiv float 1.0, %vec
|
||||
store float %add, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fdiv float
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fdiv float
|
||||
define amdgpu_kernel void @rcp_f32_ftzdaz(float addrspace(1)* %out, float addrspace(1)* %vaddr) #1 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%add = fdiv float 1.0, %vec
|
||||
store float %add, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_f16_f32_ieee'
|
||||
; NOFP16: estimated cost of 14 for {{.*}} fdiv half
|
||||
; FP16: estimated cost of 4 for {{.*}} fdiv half
|
||||
; SIZENOF16: estimated cost of 12 for {{.*}} fdiv half
|
||||
; SIZEF16: estimated cost of 2 for {{.*}} fdiv half
|
||||
define amdgpu_kernel void @rcp_f16_f32_ieee(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fdiv half 1.0, %vec
|
||||
store half %add, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_f16_f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
|
||||
define amdgpu_kernel void @rcp_f16_f32_ftzdaz(half addrspace(1)* %out, half addrspace(1)* %vaddr) #1 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fdiv half 1.0, %vec
|
||||
store half %add, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_f64'
|
||||
; CIFASTF64: estimated cost of 24 for {{.*}} fdiv double
|
||||
; CISLOWF64: estimated cost of 38 for {{.*}} fdiv double
|
||||
; SIFASTF64: estimated cost of 27 for {{.*}} fdiv double
|
||||
; SISLOWF64: estimated cost of 41 for {{.*}} fdiv double
|
||||
; SIZECI: estimated cost of 22 for {{.*}} fdiv double
|
||||
; SIZESI: estimated cost of 25 for {{.*}} fdiv double
|
||||
define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
|
||||
%vec = load double, double addrspace(1)* %vaddr
|
||||
%add = fdiv double 1.0, %vec
|
||||
store double %add, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_v2f32_ieee'
|
||||
; THRPTALL: estimated cost of 28 for {{.*}} fdiv <2 x float>
|
||||
; SIZEALL: estimated cost of 24 for {{.*}} fdiv <2 x float>
|
||||
define amdgpu_kernel void @rcp_v2f32_ieee(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
|
||||
store <2 x float> %add, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_v2f32_ftzdaz'
|
||||
; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
|
||||
define amdgpu_kernel void @rcp_v2f32_ftzdaz(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #1 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x float> <float 1.0, float 1.0>, %vec
|
||||
store <2 x float> %add, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_v2f16_f32_ieee'
|
||||
; NOFP16: estimated cost of 28 for {{.*}} fdiv <2 x half>
|
||||
; FP16: estimated cost of 8 for {{.*}} fdiv <2 x half>
|
||||
; SIZENOF16: estimated cost of 24 for {{.*}} fdiv <2 x half>
|
||||
; SIZEF16: estimated cost of 4 for {{.*}} fdiv <2 x half>
|
||||
define amdgpu_kernel void @rcp_v2f16_f32_ieee(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
|
||||
store <2 x half> %add, <2 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @rcp_ieee() #0 {
|
||||
%f32 = fdiv float 1.0, undef
|
||||
%f16 = fdiv half 1.0, undef
|
||||
%f64 = fdiv double 1.0, undef
|
||||
%v2f32 = fdiv <2 x float> <float 1.0, float 1.0>, undef
|
||||
%v2f16 = fdiv <2 x half> <half 1.0, half 1.0>, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'rcp_v2f16_f32_ftzdaz'
|
||||
; ALL-LABEL: 'rcp_ftzdaz'
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fdiv float
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fdiv float
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fdiv half
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fdiv half
|
||||
; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x float>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x float>
|
||||
; THRPTALL: estimated cost of 8 for {{.*}} fdiv <2 x half>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fdiv <2 x half>
|
||||
define amdgpu_kernel void @rcp_v2f16_f32_ftzdaz(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #1 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fdiv <2 x half> <half 1.0, half 1.0>, %vec
|
||||
store <2 x half> %add, <2 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @rcp_ftzdaz() #1 {
|
||||
%f32 = fdiv float 1.0, undef
|
||||
%f16 = fdiv half 1.0, undef
|
||||
%v2f32 = fdiv <2 x float> <float 1.0, float 1.0>, undef
|
||||
%v2f16 = fdiv <2 x half> <half 1.0, half 1.0>, undef
|
||||
ret void
|
||||
}

@@ -3,48 +3,26 @@
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZEF16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SIZENOF16 %s
|
||||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
|
||||
; END.
|
||||
|
||||
; ALL-LABEL: 'fma_f32'
|
||||
; SLOWF32: estimated cost of 4 for {{.*}} call float @llvm.fma.f32
|
||||
; FASTF32: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} call float @llvm.fma.f32
|
||||
define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%fma = call float @llvm.fma.f32(float %vec, float %vec, float %vec) #1
|
||||
store float %fma, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fma_v2f32'
|
||||
; SLOWF32: estimated cost of 8 for {{.*}} call <2 x float> @llvm.fma.v2f32
|
||||
; PACKEDF32: estimated cost of 2 for {{.*}} call <2 x float> @llvm.fma.v2f32
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} call <2 x float> @llvm.fma.v2f32
|
||||
define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%fma = call <2 x float> @llvm.fma.v2f32(<2 x float> %vec, <2 x float> %vec, <2 x float> %vec) #1
|
||||
store <2 x float> %fma, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fma_v3f32'
|
||||
; SLOWF32: estimated cost of 12 for {{.*}} call <3 x float> @llvm.fma.v3f32
|
||||
; PACKEDF32: estimated cost of 4 for {{.*}} call <3 x float> @llvm.fma.v3f32
|
||||
; SIZEALL: estimated cost of 6 for {{.*}} call <3 x float> @llvm.fma.v3f32
|
||||
define amdgpu_kernel void @fma_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%fma = call <3 x float> @llvm.fma.v3f32(<3 x float> %vec, <3 x float> %vec, <3 x float> %vec) #1
|
||||
store <3 x float> %fma, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fma_v5f32'
|
||||
; SLOWF32: estimated cost of 20 for {{.*}} call <5 x float> @llvm.fma.v5f32
|
||||
; PACKEDF32: estimated cost of 6 for {{.*}} call <5 x float> @llvm.fma.v5f32
|
||||
; SIZEALL: estimated cost of 10 for {{.*}} call <5 x float> @llvm.fma.v5f32
|
||||
define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%fma = call <5 x float> @llvm.fma.v5f32(<5 x float> %vec, <5 x float> %vec, <5 x float> %vec) #1
|
||||
store <5 x float> %fma, <5 x float> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fma_f32() #0 {
|
||||
%f32 = call float @llvm.fma.f32(float undef, float undef, float undef) #1
|
||||
%v2f32 = call <2 x float> @llvm.fma.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) #1
|
||||
%v3f32 = call <3 x float> @llvm.fma.v3f32(<3 x float> undef, <3 x float> undef, <3 x float> undef) #1
|
||||
%v5f32 = call <5 x float> @llvm.fma.v5f32(<5 x float> undef, <5 x float> undef, <5 x float> undef) #1
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
@ -53,33 +31,17 @@ define amdgpu_kernel void @fma_v5f32(<5 x float> addrspace(1)* %out, <5 x float>
; GFX90A-FASTF64: estimated cost of 1 for {{.*}} call double @llvm.fma.f64
; FASTF64: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
; SIZEALL: estimated cost of 2 for {{.*}} call double @llvm.fma.f64
define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
%vec = load double, double addrspace(1)* %vaddr
%fma = call double @llvm.fma.f64(double %vec, double %vec, double %vec) #1
store double %fma, double addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fma_v2f64'
; SLOWF64: estimated cost of 8 for {{.*}} call <2 x double> @llvm.fma.v2f64
; GFX90A-FASTF64: estimated cost of 2 for {{.*}} call <2 x double> @llvm.fma.v2f64
; FASTF64: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
; SIZEALL: estimated cost of 4 for {{.*}} call <2 x double> @llvm.fma.v2f64
define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%fma = call <2 x double> @llvm.fma.v2f64(<2 x double> %vec, <2 x double> %vec, <2 x double> %vec) #1
store <2 x double> %fma, <2 x double> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fma_v3f64'
; SLOWF64: estimated cost of 12 for {{.*}} call <3 x double> @llvm.fma.v3f64
; FASTF64: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
; SIZEALL: estimated cost of 6 for {{.*}} call <3 x double> @llvm.fma.v3f64
define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%fma = call <3 x double> @llvm.fma.v3f64(<3 x double> %vec, <3 x double> %vec, <3 x double> %vec) #1
store <3 x double> %fma, <3 x double> addrspace(1)* %out
define amdgpu_kernel void @fma_f64() #0 {
%f64 = call double @llvm.fma.f64(double undef, double undef, double undef) #1
%v2f64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef) #1
%v3f64 = call <3 x double> @llvm.fma.v3f64(<3 x double> undef, <3 x double> undef, <3 x double> undef) #1
ret void
}

@ -87,34 +49,18 @@ define amdgpu_kernel void @fma_v3f64(<3 x double> addrspace(1)* %out, <3 x doubl
; SLOWF16: estimated cost of 4 for {{.*}} call half @llvm.fma.f16
; FASTF16: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
; SIZEALL: estimated cost of 2 for {{.*}} call half @llvm.fma.f16
define amdgpu_kernel void @fma_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
%vec = load half, half addrspace(1)* %vaddr
%fma = call half @llvm.fma.f16(half %vec, half %vec, half %vec) #1
store half %fma, half addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fma_v2f16'
; SLOWF16: estimated cost of 8 for {{.*}} call <2 x half> @llvm.fma.v2f16
; FASTF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
; SIZEF16: estimated cost of 2 for {{.*}} call <2 x half> @llvm.fma.v2f16
; SIZENOF16: estimated cost of 4 for {{.*}} call <2 x half> @llvm.fma.v2f16
define amdgpu_kernel void @fma_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%fma = call <2 x half> @llvm.fma.v2f16(<2 x half> %vec, <2 x half> %vec, <2 x half> %vec) #1
store <2 x half> %fma, <2 x half> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fma_v3f16'
; SLOWF16: estimated cost of 16 for {{.*}} call <3 x half> @llvm.fma.v3f16
; FASTF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
; SIZEF16: estimated cost of 4 for {{.*}} call <3 x half> @llvm.fma.v3f16
; SIZENOF16: estimated cost of 8 for {{.*}} call <3 x half> @llvm.fma.v3f16
define amdgpu_kernel void @fma_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
%fma = call <3 x half> @llvm.fma.v3f16(<3 x half> %vec, <3 x half> %vec, <3 x half> %vec) #1
store <3 x half> %fma, <3 x half> addrspace(1)* %out
define amdgpu_kernel void @fma_f16() #0 {
%f16 = call half @llvm.fma.f16(half undef, half undef, half undef) #1
%v2f16 = call <2 x half> @llvm.fma.v2f16(<2 x half> undef, <2 x half> undef, <2 x half> undef) #1
%v3f16 = call <3 x half> @llvm.fma.v3f16(<3 x half> undef, <3 x half> undef, <3 x half> undef) #1
ret void
}

@ -3,47 +3,25 @@
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FASTF16 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOWF16 %s
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=GFX90A-FASTF64,FASTF16,PACKEDF32,ALL %s
; END.

; ALL-LABEL: 'fmul_f32'
; ALL: estimated cost of 1 for {{.*}} fmul float
define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
%vec = load float, float addrspace(1)* %vaddr
%add = fmul float %vec, %b
store float %add, float addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v2f32'
; NOPACKEDF32: estimated cost of 2 for {{.*}} fmul <2 x float>
; PACKEDF32: estimated cost of 1 for {{.*}} fmul <2 x float>
define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
%add = fmul <2 x float> %vec, %b
store <2 x float> %add, <2 x float> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v3f32'
; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
; and 3 when it is legal.
;;; Allow for 4 when v3f32 is illegal and TargetLowering thinks it needs widening,
;;; and 3 when it is legal.
; NOPACKEDF32: estimated cost of {{[34]}} for {{.*}} fmul <3 x float>
; PACKEDF32: estimated cost of 2 for {{.*}} fmul <3 x float>
define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
%add = fmul <3 x float> %vec, %b
store <3 x float> %add, <3 x float> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v5f32'
; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
; and 5 when it is legal.
;;; Allow for 8 when v5f32 is illegal and TargetLowering thinks it needs widening,
;;; and 5 when it is legal.
; NOPACKEDF32: estimated cost of {{[58]}} for {{.*}} fmul <5 x float>
; PACKEDF32: estimated cost of 3 for {{.*}} fmul <5 x float>
define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
%add = fmul <5 x float> %vec, %b
store <5 x float> %add, <5 x float> addrspace(1)* %out
define amdgpu_kernel void @fmul_f32() #0 {
%f32 = fmul float undef, undef
%v2f32 = fmul <2 x float> undef, undef
%v3f32 = fmul <3 x float> undef, undef
%v5f32 = fmul <5 x float> undef, undef
ret void
}

@ -52,71 +30,32 @@ define amdgpu_kernel void @fmul_v5f32(<5 x float> addrspace(1)* %out, <5 x float
; FASTF64: estimated cost of 2 for {{.*}} fmul double
; SLOWF64: estimated cost of 4 for {{.*}} fmul double
; SIZEALL: estimated cost of 2 for {{.*}} fmul double
define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
%vec = load double, double addrspace(1)* %vaddr
%add = fmul double %vec, %b
store double %add, double addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v2f64'
; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
; SLOWF64: estimated cost of 8 for {{.*}} fmul <2 x double>
; SIZEALL: estimated cost of 4 for {{.*}} fmul <2 x double>
define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
%add = fmul <2 x double> %vec, %b
store <2 x double> %add, <2 x double> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v3f64'
; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
; SLOWF64: estimated cost of 12 for {{.*}} fmul <3 x double>
; SIZEALL: estimated cost of 6 for {{.*}} fmul <3 x double>
define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
%add = fmul <3 x double> %vec, %b
store <3 x double> %add, <3 x double> addrspace(1)* %out
define amdgpu_kernel void @fmul_f64() #0 {
%f64 = fmul double undef, undef
%v2f64 = fmul <2 x double> undef, undef
%v3f64 = fmul <3 x double> undef, undef
ret void
}

; ALL-LABEL: 'fmul_f16'
; ALL: estimated cost of 1 for {{.*}} fmul half
define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
%vec = load half, half addrspace(1)* %vaddr
%add = fmul half %vec, %b
store half %add, half addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v2f16'
; SLOWF16: estimated cost of 2 for {{.*}} fmul <2 x half>
; FASTF16: estimated cost of 1 for {{.*}} fmul <2 x half>
define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
%add = fmul <2 x half> %vec, %b
store <2 x half> %add, <2 x half> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v3f16'
; SLOWF16: estimated cost of 4 for {{.*}} fmul <3 x half>
; FASTF16: estimated cost of 2 for {{.*}} fmul <3 x half>
define amdgpu_kernel void @fmul_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
%add = fmul <3 x half> %vec, %b
store <3 x half> %add, <3 x half> addrspace(1)* %out
ret void
}

; ALL-LABEL: 'fmul_v4f16'
; SLOWF16: estimated cost of 4 for {{.*}} fmul <4 x half>
; FASTF16: estimated cost of 2 for {{.*}} fmul <4 x half>
define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
%add = fmul <4 x half> %vec, %b
store <4 x half> %add, <4 x half> addrspace(1)* %out
define amdgpu_kernel void @fmul_f16() #0 {
%f16 = fmul half undef, undef
%v2f16 = fmul <2 x half> undef, undef
%v3f16 = fmul <3 x half> undef, undef
%v4f16 = fmul <4 x half> undef, undef
ret void
}

@ -1,102 +1,38 @@
|
|||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
|
||||
; END.
|
||||
|
||||
; CHECK-LABEL: 'fneg_f32'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg float
|
||||
define amdgpu_kernel void @fneg_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%fadd = fadd float %vec, undef
|
||||
%fneg = fneg float %fadd
|
||||
store float %fneg, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v2f32'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <2 x float>
|
||||
define amdgpu_kernel void @fneg_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%fadd = fadd <2 x float> %vec, undef
|
||||
%fneg = fneg <2 x float> %fadd
|
||||
store <2 x float> %fneg, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v3f32'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <3 x float>
|
||||
define amdgpu_kernel void @fneg_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%fadd = fadd <3 x float> %vec, undef
|
||||
%fneg = fneg <3 x float> %fadd
|
||||
store <3 x float> %fneg, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v5f32'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <5 x float>
|
||||
define amdgpu_kernel void @fneg_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr) {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%fadd = fadd <5 x float> %vec, undef
|
||||
%fneg = fneg <5 x float> %fadd
|
||||
store <5 x float> %fneg, <5 x float> addrspace(1)* %out
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg float
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <2 x float>
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <3 x float>
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <5 x float>
|
||||
define amdgpu_kernel void @fneg_f32() {
|
||||
%f32 = fneg float undef
|
||||
%v2f32 = fneg <2 x float> undef
|
||||
%v3f32 = fneg <3 x float> undef
|
||||
%v5f32 = fneg <5 x float> undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_f64'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg double
|
||||
define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) {
|
||||
%vec = load double, double addrspace(1)* %vaddr
|
||||
%fadd = fadd double %vec, undef
|
||||
%fneg = fneg double %fadd
|
||||
store double %fneg, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v2f64'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <2 x double>
|
||||
define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
|
||||
%fadd = fadd <2 x double> %vec, undef
|
||||
%fneg = fneg <2 x double> %fadd
|
||||
store <2 x double> %fneg, <2 x double> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v3f64'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <3 x double>
|
||||
define amdgpu_kernel void @fneg_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) {
|
||||
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
|
||||
%fadd = fadd <3 x double> %vec, undef
|
||||
%fneg = fneg <3 x double> %fadd
|
||||
store <3 x double> %fneg, <3 x double> addrspace(1)* %out
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg double
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <2 x double>
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <3 x double>
|
||||
define amdgpu_kernel void @fneg_f64() {
|
||||
%f64 = fneg double undef
|
||||
%v2f64 = fneg <2 x double> undef
|
||||
%v3f64 = fneg <3 x double> undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_f16'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg half
|
||||
define amdgpu_kernel void @fneg_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%fadd = fadd half %vec, undef
|
||||
%fneg = fneg half %fadd
|
||||
store half %fneg, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v2f16'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <2 x half>
|
||||
define amdgpu_kernel void @fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%fadd = fadd <2 x half> %vec, undef
|
||||
%fneg = fneg <2 x half> %fadd
|
||||
store <2 x half> %fneg, <2 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; CHECK-LABEL: 'fneg_v3f16'
|
||||
; CHECK: estimated cost of 0 for instruction: %fneg = fneg <3 x half>
|
||||
define amdgpu_kernel void @fneg_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%fadd = fadd <3 x half> %vec, undef
|
||||
%fneg = fneg <3 x half> %fadd
|
||||
store <3 x half> %fneg, <3 x half> addrspace(1)* %out
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg half
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <2 x half>
|
||||
; CHECK: estimated cost of 0 for {{.*}} fneg <3 x half>
|
||||
define amdgpu_kernel void @fneg_f16() {
|
||||
%f16 = fneg half undef
|
||||
%v2f16 = fneg <2 x half> undef
|
||||
%v3f16 = fneg <3 x half> undef
|
||||
ret void
|
||||
}
|
||||
|
|
|
|||
|
|
@ -2,40 +2,18 @@
|
|||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SLOWF64,SLOWF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,FASTF16,ALL %s
|
||||
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=SIZEALL,SLOWF16,ALL %s
|
||||
; END.
|
||||
|
||||
; ALL-LABEL: 'fsub_f32'
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub float
|
||||
define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
|
||||
%vec = load float, float addrspace(1)* %vaddr
|
||||
%add = fsub float %vec, %b
|
||||
store float %add, float addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v2f32'
|
||||
; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
|
||||
define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
|
||||
%vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
|
||||
%add = fsub <2 x float> %vec, %b
|
||||
store <2 x float> %add, <2 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v3f32'
|
||||
; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
|
||||
define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
|
||||
%vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
|
||||
%add = fsub <3 x float> %vec, %b
|
||||
store <3 x float> %add, <3 x float> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v5f32'
|
||||
; ALL: estimated cost of 5 for {{.*}} fsub <5 x float>
|
||||
define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float> addrspace(1)* %vaddr, <5 x float> %b) #0 {
|
||||
%vec = load <5 x float>, <5 x float> addrspace(1)* %vaddr
|
||||
%add = fsub <5 x float> %vec, %b
|
||||
store <5 x float> %add, <5 x float> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fsub_f32() #0 {
|
||||
%f32 = fsub float undef, undef
|
||||
%v2f32 = fsub <2 x float> undef, undef
|
||||
%v3f32 = fsub <3 x float> undef, undef
|
||||
%v5f32 = fsub <5 x float> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
@ -43,70 +21,31 @@ define amdgpu_kernel void @fsub_v5f32(<5 x float> addrspace(1)* %out, <5 x float
|
|||
; FASTF64: estimated cost of 2 for {{.*}} fsub double
|
||||
; SLOWF64: estimated cost of 4 for {{.*}} fsub double
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fsub double
|
||||
define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
|
||||
%vec = load double, double addrspace(1)* %vaddr
|
||||
%add = fsub double %vec, %b
|
||||
store double %add, double addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v2f64'
|
||||
; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
|
||||
; SLOWF64: estimated cost of 8 for {{.*}} fsub <2 x double>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
|
||||
define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
|
||||
%vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
|
||||
%add = fsub <2 x double> %vec, %b
|
||||
store <2 x double> %add, <2 x double> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v3f64'
|
||||
; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
|
||||
; SLOWF64: estimated cost of 12 for {{.*}} fsub <3 x double>
|
||||
; SIZEALL: estimated cost of 6 for {{.*}} fsub <3 x double>
|
||||
define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
|
||||
%vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
|
||||
%add = fsub <3 x double> %vec, %b
|
||||
store <3 x double> %add, <3 x double> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fsub_f64() #0 {
|
||||
%f64 = fsub double undef, undef
|
||||
%v2f64 = fsub <2 x double> undef, undef
|
||||
%v3f64 = fsub <3 x double> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_f16'
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub half
|
||||
define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
|
||||
%vec = load half, half addrspace(1)* %vaddr
|
||||
%add = fsub half %vec, %b
|
||||
store half %add, half addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v2f16'
|
||||
; SLOWF16: estimated cost of 2 for {{.*}} fsub <2 x half>
|
||||
; FASTF16: estimated cost of 1 for {{.*}} fsub <2 x half>
|
||||
define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
|
||||
%vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
|
||||
%add = fsub <2 x half> %vec, %b
|
||||
store <2 x half> %add, <2 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v3f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fsub <3 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fsub <3 x half>
|
||||
define amdgpu_kernel void @fsub_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr, <3 x half> %b) #0 {
|
||||
%vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
|
||||
%add = fsub <3 x half> %vec, %b
|
||||
store <3 x half> %add, <3 x half> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fsub_v4f16'
|
||||
; SLOWF16: estimated cost of 4 for {{.*}} fsub <4 x half>
|
||||
; FASTF16: estimated cost of 2 for {{.*}} fsub <4 x half>
|
||||
define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
|
||||
%vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
|
||||
%add = fsub <4 x half> %vec, %b
|
||||
store <4 x half> %add, <4 x half> addrspace(1)* %out
|
||||
define amdgpu_kernel void @fsub_f16() #0 {
|
||||
%f16 = fsub half undef, undef
|
||||
%v2f16 = fsub <2 x half> undef, undef
|
||||
%v3f16 = fsub <3 x half> undef, undef
|
||||
%v4f16 = fsub <4 x half> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
|
|
|||
|
|
@ -6,167 +6,109 @@
|
|||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=on < %s | FileCheck -check-prefixes=SLOW,SZNOCONTRACT,SIZEALL,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -denormal-fp-math-f32=ieee -denormal-fp-math=ieee -fp-contract=fast < %s | FileCheck -check-prefixes=FUSED,CONTRACT,SIZEALL,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -denormal-fp-math-f32=preserve-sign -denormal-fp-math=preserve-sign -fp-contract=on < %s | FileCheck -check-prefixes=GFX1030,SZNOCONTRACT,SIZEALL,ALL %s
|
||||
; END.
|
||||
|
||||
target triple = "amdgcn--"
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_f32':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul float
|
||||
; SLOW: estimated cost of 1 for instruction: %mul = fmul float
|
||||
; GFX1030: estimated cost of 1 for instruction: %mul = fmul float
|
||||
; ALL: estimated cost of 1 for instruction: %add = fadd float
|
||||
define float @fmul_fadd_f32(float %r0, float %r1, float %r2) #0 {
|
||||
%mul = fmul float %r0, %r1
|
||||
%add = fadd float %mul, %r2
|
||||
ret float %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_contract_f32':
|
||||
; ALL: estimated cost of 0 for instruction: %mul = fmul contract float
|
||||
; ALL: estimated cost of 1 for instruction: %add = fadd contract float
|
||||
define float @fmul_fadd_contract_f32(float %r0, float %r1, float %r2) #0 {
|
||||
%mul = fmul contract float %r0, %r1
|
||||
%add = fadd contract float %mul, %r2
|
||||
ret float %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_v2f32':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
|
||||
; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
|
||||
; GFX1030: estimated cost of 2 for instruction: %mul = fmul <2 x float>
|
||||
; ALL: estimated cost of 2 for instruction: %add = fadd <2 x float>
|
||||
define <2 x float> @fmul_fadd_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
|
||||
%mul = fmul <2 x float> %r0, %r1
|
||||
%add = fadd <2 x float> %mul, %r2
|
||||
ret <2 x float> %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fsub_f32':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul float
|
||||
; SLOW: estimated cost of 1 for instruction: %mul = fmul float
|
||||
; GFX1030: estimated cost of 1 for instruction: %mul = fmul float
|
||||
; ALL: estimated cost of 1 for instruction: %sub = fsub float
|
||||
define float @fmul_fsub_f32(float %r0, float %r1, float %r2) #0 {
|
||||
%mul = fmul float %r0, %r1
|
||||
%sub = fsub float %mul, %r2
|
||||
ret float %sub
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fsub_v2f32':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x float>
|
||||
; SLOW: estimated cost of 2 for instruction: %mul = fmul <2 x float>
|
||||
; GFX1030: estimated cost of 2 for instruction: %mul = fmul <2 x float>
|
||||
; ALL: estimated cost of 2 for instruction: %sub = fsub <2 x float>
|
||||
define <2 x float> @fmul_fsub_v2f32(<2 x float> %r0, <2 x float> %r1, <2 x float> %r2) #0 {
|
||||
%mul = fmul <2 x float> %r0, %r1
|
||||
%sub = fsub <2 x float> %mul, %r2
|
||||
ret <2 x float> %sub
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul float
|
||||
; SLOW: estimated cost of 1 for {{.*}} fmul float
|
||||
; GFX1030: estimated cost of 1 for {{.*}} fmul float
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd float
|
||||
; ALL: estimated cost of 0 for {{.*}} fmul contract float
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd contract float
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul <2 x float>
|
||||
; SLOW: estimated cost of 2 for {{.*}} fmul <2 x float>
|
||||
; GFX1030: estimated cost of 2 for {{.*}} fmul <2 x float>
|
||||
; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul float
|
||||
; SLOW: estimated cost of 1 for {{.*}} fmul float
|
||||
; GFX1030: estimated cost of 1 for {{.*}} fmul float
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub float
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul <2 x float>
|
||||
; SLOW: estimated cost of 2 for {{.*}} fmul <2 x float>
|
||||
; GFX1030: estimated cost of 2 for {{.*}} fmul <2 x float>
|
||||
; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
|
||||
define void @fmul_fadd_f32() #0 {
|
||||
%f32 = fmul float undef, undef
|
||||
%f32add = fadd float %f32, undef
|
||||
%f32c = fmul contract float undef, undef
|
||||
%f32cadd = fadd contract float %f32c, undef
|
||||
%v2f32 = fmul <2 x float> undef, undef
|
||||
%v2f32add = fadd <2 x float> %v2f32, undef
|
||||
%f32_2 = fmul float undef, undef
|
||||
%f32sub = fsub float %f32_2, undef
|
||||
%v2f32_2 = fmul <2 x float> undef, undef
|
||||
%v2f32sub = fsub <2 x float> %v2f32_2, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_f16':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul half
|
||||
; SLOW: estimated cost of 1 for instruction: %mul = fmul half
|
||||
; ALL: estimated cost of 1 for instruction: %add = fadd half
|
||||
define half @fmul_fadd_f16(half %r0, half %r1, half %r2) #0 {
|
||||
%mul = fmul half %r0, %r1
|
||||
%add = fadd half %mul, %r2
|
||||
ret half %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_contract_f16':
|
||||
; ALL: estimated cost of 0 for instruction: %mul = fmul contract half
|
||||
; ALL: estimated cost of 1 for instruction: %add = fadd contract half
|
||||
define half @fmul_fadd_contract_f16(half %r0, half %r1, half %r2) #0 {
|
||||
%mul = fmul contract half %r0, %r1
|
||||
%add = fadd contract half %mul, %r2
|
||||
ret half %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_v2f16':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half>
|
||||
; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half>
|
||||
; ALL: estimated cost of 1 for instruction: %add = fadd <2 x half>
|
||||
define <2 x half> @fmul_fadd_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
|
||||
%mul = fmul <2 x half> %r0, %r1
|
||||
%add = fadd <2 x half> %mul, %r2
|
||||
ret <2 x half> %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fsub_f16':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul half
|
||||
; SLOW: estimated cost of 1 for instruction: %mul = fmul half
|
||||
; ALL: estimated cost of 1 for instruction: %sub = fsub half
|
||||
define half @fmul_fsub_f16(half %r0, half %r1, half %r2) #0 {
|
||||
%mul = fmul half %r0, %r1
|
||||
%sub = fsub half %mul, %r2
|
||||
ret half %sub
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fsub_v2f16':
|
||||
; FUSED: estimated cost of 0 for instruction: %mul = fmul <2 x half>
|
||||
; SLOW: estimated cost of 1 for instruction: %mul = fmul <2 x half>
|
||||
; ALL: estimated cost of 1 for instruction: %sub = fsub <2 x half>
|
||||
define <2 x half> @fmul_fsub_v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2) #0 {
|
||||
%mul = fmul <2 x half> %r0, %r1
|
||||
%sub = fsub <2 x half> %mul, %r2
|
||||
ret <2 x half> %sub
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul half
|
||||
; SLOW: estimated cost of 1 for {{.*}} fmul half
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd half
|
||||
; ALL: estimated cost of 0 for {{.*}} fmul contract half
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd contract half
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul <2 x half>
|
||||
; SLOW: estimated cost of 1 for {{.*}} fmul <2 x half>
|
||||
; ALL: estimated cost of 1 for {{.*}} fadd <2 x half>
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul half
|
||||
; SLOW: estimated cost of 1 for {{.*}} fmul half
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub half
|
||||
; FUSED: estimated cost of 0 for {{.*}} fmul <2 x half>
|
||||
; SLOW: estimated cost of 1 for {{.*}} fmul <2 x half>
|
||||
; ALL: estimated cost of 1 for {{.*}} fsub <2 x half>
|
||||
define void @fmul_fadd_f16() #0 {
|
||||
%f16 = fmul half undef, undef
|
||||
%f16add = fadd half %f16, undef
|
||||
%f16c = fmul contract half undef, undef
|
||||
%f16cadd = fadd contract half %f16c, undef
|
||||
%v2f16 = fmul <2 x half> undef, undef
|
||||
%v2f16add = fadd <2 x half> %v2f16, undef
|
||||
%f16_2 = fmul half undef, undef
|
||||
%f16sub = fsub half %f16_2, undef
|
||||
%v2f16_2 = fmul <2 x half> undef, undef
|
||||
%v2f16sub = fsub <2 x half> %v2f16_2, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_f64':
|
||||
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
|
||||
; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double
|
||||
; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double
|
||||
; THRPTALL: estimated cost of 4 for instruction: %add = fadd double
|
||||
; SIZEALL: estimated cost of 2 for instruction: %add = fadd double
|
||||
define double @fmul_fadd_f64(double %r0, double %r1, double %r2) #0 {
|
||||
%mul = fmul double %r0, %r1
|
||||
%add = fadd double %mul, %r2
|
||||
ret double %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_contract_f64':
|
||||
; ALL: estimated cost of 0 for instruction: %mul = fmul contract double
|
||||
; THRPTALL: estimated cost of 4 for instruction: %add = fadd contract double
|
||||
; SIZEALL: estimated cost of 2 for instruction: %add = fadd contract double
|
||||
define double @fmul_fadd_contract_f64(double %r0, double %r1, double %r2) #0 {
|
||||
%mul = fmul contract double %r0, %r1
|
||||
%add = fadd contract double %mul, %r2
|
||||
ret double %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fadd_v2f64':
|
||||
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
|
||||
; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double>
|
||||
; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
|
||||
; THRPTALL: estimated cost of 8 for instruction: %add = fadd <2 x double>
|
||||
; SIZEALL: estimated cost of 4 for instruction: %add = fadd <2 x double>
|
||||
define <2 x double> @fmul_fadd_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
|
||||
%mul = fmul <2 x double> %r0, %r1
|
||||
%add = fadd <2 x double> %mul, %r2
|
||||
ret <2 x double> %add
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fsub_f64':
|
||||
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul double
|
||||
; NOCONTRACT: estimated cost of 4 for instruction: %mul = fmul double
|
||||
; SZNOCONTRACT: estimated cost of 2 for instruction: %mul = fmul double
|
||||
; THRPTALL: estimated cost of 4 for instruction: %sub = fsub double
|
||||
; SIZEALL: estimated cost of 2 for instruction: %sub = fsub double
|
||||
define double @fmul_fsub_f64(double %r0, double %r1, double %r2) #0 {
|
||||
%mul = fmul double %r0, %r1
|
||||
%sub = fsub double %mul, %r2
|
||||
ret double %sub
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'fmul_fsub_v2f64':
|
||||
; CONTRACT: estimated cost of 0 for instruction: %mul = fmul <2 x double>
|
||||
; NOCONTRACT: estimated cost of 8 for instruction: %mul = fmul <2 x double>
|
||||
; SZNOCONTRACT: estimated cost of 4 for instruction: %mul = fmul <2 x double>
|
||||
; THRPTALL: estimated cost of 8 for instruction: %sub = fsub <2 x double>
|
||||
; SIZEALL: estimated cost of 4 for instruction: %sub = fsub <2 x double>
|
||||
define <2 x double> @fmul_fsub_v2f64(<2 x double> %r0, <2 x double> %r1, <2 x double> %r2) #0 {
|
||||
%mul = fmul <2 x double> %r0, %r1
|
||||
%sub = fsub <2 x double> %mul, %r2
|
||||
ret <2 x double> %sub
|
||||
; CONTRACT: estimated cost of 0 for {{.*}} fmul double
|
||||
; NOCONTRACT: estimated cost of 4 for {{.*}} fmul double
|
||||
; SZNOCONTRACT: estimated cost of 2 for {{.*}} fmul double
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fadd double
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fadd double
|
||||
; ALL: estimated cost of 0 for {{.*}} fmul contract double
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fadd contract double
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fadd contract double
|
||||
; CONTRACT: estimated cost of 0 for {{.*}} fmul <2 x double>
|
||||
; NOCONTRACT: estimated cost of 8 for {{.*}} fmul <2 x double>
|
||||
; SZNOCONTRACT: estimated cost of 4 for {{.*}} fmul <2 x double>
|
||||
; THRPTALL: estimated cost of 8 for {{.*}} fadd <2 x double>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fadd <2 x double>
|
||||
; CONTRACT: estimated cost of 0 for {{.*}} fmul double
|
||||
; NOCONTRACT: estimated cost of 4 for {{.*}} fmul double
|
||||
; SZNOCONTRACT: estimated cost of 2 for {{.*}} fmul double
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} fsub double
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} fsub double
|
||||
; CONTRACT: estimated cost of 0 for {{.*}} fmul <2 x double>
|
||||
; NOCONTRACT: estimated cost of 8 for {{.*}} fmul <2 x double>
|
||||
; SZNOCONTRACT: estimated cost of 4 for {{.*}} fmul <2 x double>
|
||||
; THRPTALL: estimated cost of 8 for {{.*}} fsub <2 x double>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} fsub <2 x double>
|
||||
define void @fmul_fadd_f64() #0 {
|
||||
%f64 = fmul double undef, undef
|
||||
%f64add = fadd double %f64, undef
|
||||
%f64c = fmul contract double undef, undef
|
||||
%f64cadd = fadd contract double %f64c, undef
|
||||
%v2f64 = fmul <2 x double> undef, undef
|
||||
%v2f64add = fadd <2 x double> %v2f64, undef
|
||||
%f64_2 = fmul double undef, undef
|
||||
%f64sub = fsub double %f64_2, undef
|
||||
%v2f64_2 = fmul <2 x double> undef, undef
|
||||
%v2f64sub = fsub <2 x double> %v2f64_2, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
attributes #0 = { nounwind }
|
||||
|
||||
|
|
|
|||
|
|
@ -4,49 +4,20 @@
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,GFX89 %s
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX89 %s
; END.

; GCN-LABEL: 'insertelement_v2i32'
; GCN-LABEL: 'insertelement_v2'
; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32>
define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
%insert = insertelement <2 x i32> %vec, i32 123, i32 1
store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
ret void
}

; GCN-LABEL: 'insertelement_v2i64'
; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64>
define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
%insert = insertelement <2 x i64> %vec, i64 123, i64 1
store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
ret void
}

; GCN-LABEL: 'insertelement_0_v2i16'
; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16>
; GFX89: estimated cost of 0 for {{.*}} insertelement <2 x i16>
define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
%insert = insertelement <2 x i16> %vec, i16 123, i16 0
store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
ret void
}

; GCN-LABEL: 'insertelement_1_v2i16'
; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16>
define amdgpu_kernel void @insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
%insert = insertelement <2 x i16> %vec, i16 123, i16 1
store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
ret void
}

; GCN-LABEL: 'insertelement_1_v2i8'
; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8>
define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
%vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
%insert = insertelement <2 x i8> %vec, i8 123, i8 1
store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
define amdgpu_kernel void @insertelement_v2() {
%v2i32_1 = insertelement <2 x i32> undef, i32 123, i32 1
%v2i64_1 = insertelement <2 x i64> undef, i64 123, i64 1
%v2i16_0 = insertelement <2 x i16> undef, i16 123, i16 0
%v2i16_1 = insertelement <2 x i16> undef, i16 123, i16 1
%v2i8_1 = insertelement <2 x i8> undef, i8 123, i8 1
ret void
}

@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s --check-prefix=CHECK-THROUGHPUT
; RUN: opt -cost-model -analyze -cost-kind=code-size -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s --check-prefix=CHECK-SIZE
; END.

define amdgpu_kernel void @op() {
; Logical and/or - select's cost must be equivalent to that of binop

@ -2,139 +2,63 @@
|
|||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=FAST16,THRPTALL,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck -check-prefixes=SIZESLOW16,SIZEALL,ALL %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=SIZEFAST16,SIZEALL,ALL %s
|
||||
; END.
|
||||
|
||||
; ALL-LABEL: 'mul_i32'
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} mul i32
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} mul i32
|
||||
define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%mul = mul i32 %vec, %b
|
||||
store i32 %mul, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v2i32'
|
||||
; THRPTALL: estimated cost of 8 for {{.*}} mul <2 x i32>
|
||||
; SIZEALL: estimated cost of 4 for {{.*}} mul <2 x i32>
|
||||
define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
|
||||
%vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <2 x i32> %vec, %b
|
||||
store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v3i32'
|
||||
; THRPTALL: estimated cost of 12 for {{.*}} mul <3 x i32>
|
||||
; SIZEALL: estimated cost of 6 for {{.*}} mul <3 x i32>
|
||||
define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
|
||||
%vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <3 x i32> %vec, %b
|
||||
store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v5i32'
|
||||
; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
|
||||
; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
|
||||
define amdgpu_kernel void @mul_v5i32(<5 x i32> addrspace(1)* %out, <5 x i32> addrspace(1)* %vaddr, <5 x i32> %b) #0 {
|
||||
%vec = load <5 x i32>, <5 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <5 x i32> %vec, %b
|
||||
store <5 x i32> %mul, <5 x i32> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v4i32'
|
||||
; THRPTALL: estimated cost of 16 for {{.*}} mul <4 x i32>
|
||||
; SIZEALL: estimated cost of 8 for {{.*}} mul <4 x i32>
|
||||
define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
|
||||
%vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
|
||||
%mul = mul <4 x i32> %vec, %b
|
||||
store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
|
||||
; THRPTALL: estimated cost of 20 for {{.*}} mul <5 x i32>
|
||||
; SIZEALL: estimated cost of 10 for {{.*}} mul <5 x i32>
|
||||
define amdgpu_kernel void @mul_i32() #0 {
|
||||
%i32 = mul i32 undef, undef
|
||||
%v2i32 = mul <2 x i32> undef, undef
|
||||
%v3i32 = mul <3 x i32> undef, undef
|
||||
%v4i32 = mul <4 x i32> undef, undef
|
||||
%v5i32 = mul <5 x i32> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_i64'
|
||||
; THRPTALL: estimated cost of 20 for {{.*}} mul i64
|
||||
; SIZEALL: estimated cost of 12 for {{.*}} mul i64
|
||||
define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%mul = mul i64 %vec, %b
|
||||
store i64 %mul, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v2i64'
|
||||
; THRPTALL: estimated cost of 40 for {{.*}} mul <2 x i64>
|
||||
; SIZEALL: estimated cost of 24 for {{.*}} mul <2 x i64>
|
||||
define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
|
||||
%vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <2 x i64> %vec, %b
|
||||
store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v3i64'
|
||||
; THRPTALL: estimated cost of 60 for {{.*}} mul <3 x i64>
|
||||
; SIZEALL: estimated cost of 36 for {{.*}} mul <3 x i64>
|
||||
define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
|
||||
%vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <3 x i64> %vec, %b
|
||||
store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v4i64'
|
||||
; THRPTALL: estimated cost of 80 for {{.*}} mul <4 x i64>
|
||||
; SIZEALL: estimated cost of 48 for {{.*}} mul <4 x i64>
|
||||
define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
|
||||
%vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <4 x i64> %vec, %b
|
||||
store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
|
||||
; ALL-LABEL: 'mul_v8i64'
|
||||
; THRPTALL: estimated cost of 320 for {{.*}} mul <8 x i64>
|
||||
; SIZEALL: estimated cost of 192 for {{.*}} mul <8 x i64>
|
||||
define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
|
||||
%vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
|
||||
%mul = mul <8 x i64> %vec, %b
|
||||
store <8 x i64> %mul, <8 x i64> addrspace(1)* %out
|
||||
define amdgpu_kernel void @mul_i64() #0 {
|
||||
%i64 = mul i64 undef, undef
|
||||
%v2i64 = mul <2 x i64> undef, undef
|
||||
%v3i64 = mul <3 x i64> undef, undef
|
||||
%v4i64 = mul <4 x i64> undef, undef
|
||||
%v8i64 = mul <8 x i64> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_i16'
|
||||
; THRPTALL: estimated cost of 4 for {{.*}} mul i16
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} mul i16
|
||||
define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%mul = mul i16 %vec, %b
|
||||
store i16 %mul, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v2i16'
|
||||
; SLOW16: estimated cost of 8 for {{.*}} mul <2 x i16>
|
||||
; FAST16: estimated cost of 4 for {{.*}} mul <2 x i16>
|
||||
; SIZESLOW16: estimated cost of 4 for {{.*}} mul <2 x i16>
|
||||
; SIZEFAST16: estimated cost of 2 for {{.*}} mul <2 x i16>
|
||||
define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%mul = mul <2 x i16> %vec, %b
|
||||
store <2 x i16> %mul, <2 x i16> addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'mul_v3i16'
|
||||
; SLOW16: estimated cost of 16 for {{.*}} mul <3 x i16>
|
||||
; FAST16: estimated cost of 8 for {{.*}} mul <3 x i16>
|
||||
; SIZESLOW16: estimated cost of 8 for {{.*}} mul <3 x i16>
|
||||
; SIZEFAST16: estimated cost of 4 for {{.*}} mul <3 x i16>
|
||||
define amdgpu_kernel void @mul_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %vaddr, <3 x i16> %b) #0 {
|
||||
%vec = load <3 x i16>, <3 x i16> addrspace(1)* %vaddr
|
||||
%mul = mul <3 x i16> %vec, %b
|
||||
store <3 x i16> %mul, <3 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @mul_i16() #0 {
|
||||
%i16 = mul i16 undef, undef
|
||||
%v2i16 = mul <2 x i16> undef, undef
|
||||
%v3i16 = mul <3 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -cost-model -cost-kind=throughput -analyze | FileCheck %s
; END.

define i32 @reduce_i1(i32 %arg) {
; CHECK-LABEL: 'reduce_i1'

@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -cost-model -cost-kind=throughput -analyze | FileCheck %s
; END.

define i32 @reduce_i1(i32 %arg) {
; CHECK-LABEL: 'reduce_i1'

@ -2,120 +2,52 @@
|
|||
; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SLOW64,SLOW16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -mattr=+half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,FAST16 %s
|
||||
; RUN: opt -cost-model -cost-kind=code-size -analyze -mtriple=amdgcn-unknown-amdhsa -mattr=-half-rate-64-ops < %s | FileCheck -check-prefixes=ALL,SIZEALL,SLOW16 %s
|
||||
; END.
|
||||
|
||||
; ALL-LABEL: 'shl_i32'
|
||||
; ALL-LABEL: 'shl'
|
||||
; ALL: estimated cost of 1 for {{.*}} shl i32
|
||||
define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = shl i32 %vec, %b
|
||||
store i32 %or, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'shl_i64'
|
||||
; FAST64: estimated cost of 2 for {{.*}} shl i64
|
||||
; SLOW64: estimated cost of 4 for {{.*}} shl i64
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} shl i64
|
||||
define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = shl i64 %vec, %b
|
||||
store i64 %or, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'shl_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} shl i16
|
||||
define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%or = shl i16 %vec, %b
|
||||
store i16 %or, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'shl_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} shl <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} shl <2 x i16>
|
||||
define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%or = shl <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @shl() #0 {
|
||||
%i32 = shl i32 undef, undef
|
||||
%i64 = shl i64 undef, undef
|
||||
%i16 = shl i16 undef, undef
|
||||
%v2i16 = shl <2 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'lshr_i32'
|
||||
; ALL-LABEL: 'lshr'
|
||||
; ALL: estimated cost of 1 for {{.*}} lshr i32
|
||||
define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = lshr i32 %vec, %b
|
||||
store i32 %or, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'lshr_i64'
|
||||
; FAST64: estimated cost of 2 for {{.*}} lshr i64
|
||||
; SLOW64: estimated cost of 4 for {{.*}} lshr i64
|
||||
; SIZEALL: estimated cost of 2 for {{.*}} lshr i64
|
||||
define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = lshr i64 %vec, %b
|
||||
store i64 %or, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'lshr_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} lshr i16
|
||||
define amdgpu_kernel void @lshr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%or = lshr i16 %vec, %b
|
||||
store i16 %or, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'lshr_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} lshr <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} lshr <2 x i16>
|
||||
define amdgpu_kernel void @lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%or = lshr <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @lshr() #0 {
|
||||
%i32 = lshr i32 undef, undef
|
||||
%i64 = lshr i64 undef, undef
|
||||
%i16 = lshr i16 undef, undef
|
||||
%v2i16 = lshr <2 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'ashr_i32'
|
||||
; ALL-LABEL: 'ashr'
|
||||
; ALL: estimated cost of 1 for {{.*}} ashr i32
|
||||
define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
|
||||
%vec = load i32, i32 addrspace(1)* %vaddr
|
||||
%or = ashr i32 %vec, %b
|
||||
store i32 %or, i32 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'ashr_i64'
|
||||
; FAST64: estimated cost of 2 for {{.*}} ashr i64
|
||||
; SLOW64: estimated cost of 4 for {{.*}} ashr i64
|
||||
define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
|
||||
%vec = load i64, i64 addrspace(1)* %vaddr
|
||||
%or = ashr i64 %vec, %b
|
||||
store i64 %or, i64 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'ashr_i16'
|
||||
; ALL: estimated cost of 1 for {{.*}} ashr i16
|
||||
define amdgpu_kernel void @ashr_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
|
||||
%vec = load i16, i16 addrspace(1)* %vaddr
|
||||
%or = ashr i16 %vec, %b
|
||||
store i16 %or, i16 addrspace(1)* %out
|
||||
ret void
|
||||
}
|
||||
|
||||
; ALL-LABEL: 'ashr_v2i16'
|
||||
; SLOW16: estimated cost of 2 for {{.*}} ashr <2 x i16>
|
||||
; FAST16: estimated cost of 1 for {{.*}} ashr <2 x i16>
|
||||
define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
|
||||
%vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
|
||||
%or = ashr <2 x i16> %vec, %b
|
||||
store <2 x i16> %or, <2 x i16> addrspace(1)* %out
|
||||
define amdgpu_kernel void @ashr() #0 {
|
||||
%i32 = ashr i32 undef, undef
|
||||
%i64 = ashr i64 undef, undef
|
||||
%i16 = ashr i16 undef, undef
|
||||
%v2i16 = ashr <2 x i16> undef, undef
|
||||
ret void
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@
; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -cost-kind=code-size -S | FileCheck -check-prefixes=GFX9-CS %s
; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -S | FileCheck -check-prefixes=VI %s
; RUN: opt < %s -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -cost-kind=code-size -S | FileCheck -check-prefixes=VI-CS %s
; END.

define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> %vec0, <2 x i16> %vec1) {
; GFX10-LABEL: 'shufflevector_00_v2i16'