; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -memcpyopt -S -verify-memoryssa | FileCheck %s
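
; Background note: a <vscale x n x ty> value occupies n * sizeof(ty) * vscale
; bytes, where vscale is a positive integer known only at run time, so sizes
; involving these types cannot be folded to compile-time constants.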

; Check that a call featuring a scalable-vector byval argument fed by a memcpy
; doesn't crash the compiler. It previously assumed the byval type's size could
; be represented as a known constant amount.
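; With a fixed-size byval type, MemCpyOpt could forward %P to the callee in
; place of the alloca; here the byval size is unknown at compile time, so the
; CHECK lines below expect the IR to be left untouched.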
define void @byval_caller(i8 *%P) {
; CHECK-LABEL: @byval_caller(
; CHECK-NEXT:    [[A:%.*]] = alloca i8, align 1
; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A]], i8* align 4 [[P:%.*]], i64 8, i1 false)
; CHECK-NEXT:    [[VA:%.*]] = bitcast i8* [[A]] to <vscale x 1 x i8>*
; CHECK-NEXT:    call void @byval_callee(<vscale x 1 x i8>* byval(<vscale x 1 x i8>) align 1 [[VA]])
; CHECK-NEXT:    ret void
;
  %a = alloca i8
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a, i8* align 4 %P, i64 8, i1 false)
  %va = bitcast i8* %a to <vscale x 1 x i8>*
  call void @byval_callee(<vscale x 1 x i8>* align 1 byval(<vscale x 1 x i8>) %va)
  ret void
}

declare void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4, i8* align 4, i64, i1)
declare void @byval_callee(<vscale x 1 x i8>* align 1 byval(<vscale x 1 x i8>))

; Check that two scalable-vector stores (overlapping, with a constant offset)
; do not crash the compiler when it checks whether they can be merged into
; a single memset. There was previously an assumption that the stored values'
; sizes could be represented by a known constant amount.
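; Forming a memset needs the byte size of each store; a scalable vector's store
; size depends on vscale, so the CHECK lines below expect both stores to remain
; separate.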
define void @merge_stores_both_scalable(<vscale x 1 x i8>* %ptr) {
; CHECK-LABEL: @merge_stores_both_scalable(
; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR:%.*]], align 1
; CHECK-NEXT:    [[PTRI8:%.*]] = bitcast <vscale x 1 x i8>* [[PTR]] to i8*
; CHECK-NEXT:    [[PTR_NEXT:%.*]] = getelementptr i8, i8* [[PTRI8]], i64 1
; CHECK-NEXT:    [[PTR_NEXT_2:%.*]] = bitcast i8* [[PTR_NEXT]] to <vscale x 1 x i8>*
; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR_NEXT_2]], align 1
; CHECK-NEXT:    ret void
;
  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr
  %ptri8 = bitcast <vscale x 1 x i8>* %ptr to i8*
  %ptr.next = getelementptr i8, i8* %ptri8, i64 1
  %ptr.next.2 = bitcast i8* %ptr.next to <vscale x 1 x i8>*
  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr.next.2
  ret void
}

; As above, but where the base is scalable but the subsequent store(s) are not.
define void @merge_stores_first_scalable(<vscale x 1 x i8>* %ptr) {
; CHECK-LABEL: @merge_stores_first_scalable(
; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR:%.*]], align 1
; CHECK-NEXT:    [[PTRI8:%.*]] = bitcast <vscale x 1 x i8>* [[PTR]] to i8*
; CHECK-NEXT:    [[PTR_NEXT:%.*]] = getelementptr i8, i8* [[PTRI8]], i64 1
; CHECK-NEXT:    store i8 0, i8* [[PTR_NEXT]], align 1
; CHECK-NEXT:    ret void
;
  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr
  %ptri8 = bitcast <vscale x 1 x i8>* %ptr to i8*
  %ptr.next = getelementptr i8, i8* %ptri8, i64 1
  store i8 zeroinitializer, i8* %ptr.next
  ret void
}

; As above, but where the base is not scalable but the subsequent store(s) are.
define void @merge_stores_second_scalable(i8* %ptr) {
; CHECK-LABEL: @merge_stores_second_scalable(
; CHECK-NEXT:    store i8 0, i8* [[PTR:%.*]], align 1
; CHECK-NEXT:    [[PTR_NEXT:%.*]] = getelementptr i8, i8* [[PTR]], i64 1
; CHECK-NEXT:    [[PTR_NEXT_2:%.*]] = bitcast i8* [[PTR_NEXT]] to <vscale x 1 x i8>*
; CHECK-NEXT:    store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* [[PTR_NEXT_2]], align 1
; CHECK-NEXT:    ret void
;
  store i8 zeroinitializer, i8* %ptr
  %ptr.next = getelementptr i8, i8* %ptr, i64 1
  %ptr.next.2 = bitcast i8* %ptr.next to <vscale x 1 x i8>*
  store <vscale x 1 x i8> zeroinitializer, <vscale x 1 x i8>* %ptr.next.2
  ret void
}

; Check that the call-slot optimization doesn't crash when encountering scalable types.
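; The load/store pair below copies the whole alloca to %out, so MemCpyOpt may
; try to have the scatter write into %out directly; doing that safely requires
; comparing sizes that are not compile-time constants here, and the CHECK lines
; expect the IR to be left unchanged.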
define void @callslotoptzn(<vscale x 4 x float> %val, <vscale x 4 x float>* %out) {
; CHECK-LABEL: @callslotoptzn(
; CHECK-NEXT:    [[ALLOC:%.*]] = alloca <vscale x 4 x float>, align 16
; CHECK-NEXT:    [[IDX:%.*]] = tail call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
; CHECK-NEXT:    [[BALLOC:%.*]] = getelementptr inbounds <vscale x 4 x float>, <vscale x 4 x float>* [[ALLOC]], i64 0, i64 0
; CHECK-NEXT:    [[STRIDE:%.*]] = getelementptr inbounds float, float* [[BALLOC]], <vscale x 4 x i32> [[IDX]]
; CHECK-NEXT:    call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> [[VAL:%.*]], <vscale x 4 x float*> [[STRIDE]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
; CHECK-NEXT:    [[LI:%.*]] = load <vscale x 4 x float>, <vscale x 4 x float>* [[ALLOC]], align 4
; CHECK-NEXT:    store <vscale x 4 x float> [[LI]], <vscale x 4 x float>* [[OUT:%.*]], align 4
; CHECK-NEXT:    ret void
;
  %alloc = alloca <vscale x 4 x float>, align 16
  %idx = tail call <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
  %balloc = getelementptr inbounds <vscale x 4 x float>, <vscale x 4 x float>* %alloc, i64 0, i64 0
  %stride = getelementptr inbounds float, float* %balloc, <vscale x 4 x i32> %idx
  call void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float> %val, <vscale x 4 x float*> %stride, i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i32 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
  %li = load <vscale x 4 x float>, <vscale x 4 x float>* %alloc, align 4
  store <vscale x 4 x float> %li, <vscale x 4 x float>* %out, align 4
  ret void
}

declare <vscale x 4 x i32> @llvm.experimental.stepvector.nxv4i32()
declare void @llvm.masked.scatter.nxv4f32.nxv4p0f32(<vscale x 4 x float>, <vscale x 4 x float*>, i32, <vscale x 4 x i1>)