; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
@buf = dso_local global [3072 x i8] zeroinitializer, align 16

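; test1: three tiles are loaded from @buf, combined with tdpbssd, and the
; result is stored back to @buf before the function tail-calls foo.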
define dso_local void @test1(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test1:
; CHECK:       # %bb.0:
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    jmp foo # TAILCALL
  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
  tail call void @foo()
  ret void
}

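; test2: foo is called first, then AMX tiles are used in both arms of a branch.
; Note that ldtilecfg is emitted again after the call, since the call may
; clobber the AMX tile configuration.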
define dso_local void @test2(i16 signext %0, i16 signext %1) nounwind {
; CHECK-LABEL: test2:
; CHECK:       # %bb.0:
; CHECK-NEXT:    pushq %rbp
; CHECK-NEXT:    pushq %rbx
; CHECK-NEXT:    subq $72, %rsp
; CHECK-NEXT:    movl %esi, %ebx
; CHECK-NEXT:    movl %edi, %ebp
; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    vzeroupper
; CHECK-NEXT:    callq foo
; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
; CHECK-NEXT:    xorl %eax, %eax
; CHECK-NEXT:    testb %al, %al
; CHECK-NEXT:    jne .LBB1_3
; CHECK-NEXT:  # %bb.1: # %if.true
; CHECK-NEXT:    movw $8, %ax
; CHECK-NEXT:    tilezero %tmm0
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movl $buf+1024, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
; CHECK-NEXT:    movl $buf+2048, %edx
; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
; CHECK-NEXT:    jmp .LBB1_2
; CHECK-NEXT:  .LBB1_3: # %if.false
; CHECK-NEXT:    movl $buf, %eax
; CHECK-NEXT:    movl $32, %ecx
; CHECK-NEXT:    movw $8, %dx
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
; CHECK-NEXT:    movl $buf+1024, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
; CHECK-NEXT:    movl $buf+2048, %eax
; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
; CHECK-NEXT:  .LBB1_2: # %if.true
; CHECK-NEXT:    addq $72, %rsp
; CHECK-NEXT:    popq %rbx
; CHECK-NEXT:    popq %rbp
; CHECK-NEXT:    tilerelease
; CHECK-NEXT:    retq
  call void @foo()
  br i1 undef, label %if.true, label %if.false

if.true:
  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
  br label %exit

if.false:
  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
  br label %exit

exit:
  ret void
}

declare dso_local void @foo() nounwind
declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)