llvm-project/llvm/test/Transforms/LoopVectorize/invariant-store-vectorizati...

261 lines
8.9 KiB
LLVM

; RUN: opt < %s -licm -loop-vectorize -force-vector-width=4 -dce -instcombine -licm -S | FileCheck %s
; First licm pass is to hoist/sink invariant stores if possible. Today LICM does
; not hoist/sink the invariant stores. Even if that changes, we should still
; vectorize this loop in case licm is not run.
; The next licm pass after vectorization is to hoist/sink loop invariant
; instructions.
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
; all tests check that it is legal to vectorize the stores to invariant
; address.
; CHECK-LABEL: inv_val_store_to_inv_address_with_reduction(
; memory check is found.conflict = b[max(n-1,1)] > a && (i8* a)+1 > (i8* b)
; CHECK: vector.memcheck:
; CHECK: found.conflict
; CHECK-LABEL: vector.body:
; CHECK: %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ [[ADD:%[a-zA-Z0-9.]+]], %vector.body ]
; CHECK: %wide.load = load <4 x i32>
; CHECK: [[ADD]] = add <4 x i32> %vec.phi, %wide.load
; CHECK-NEXT: store i32 %ntrunc, i32* %a
; CHECK-NEXT: %index.next = add i64 %index, 4
; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
; CHECK-NEXT: br i1
; CHECK-LABEL: middle.block:
; CHECK: %rdx.shuf = shufflevector <4 x i32>
define i32 @inv_val_store_to_inv_address_with_reduction(i32* %a, i64 %n, i32* %b) {
entry:
%ntrunc = trunc i64 %n to i32
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
%tmp3 = add i32 %tmp0, %tmp2
store i32 %ntrunc, i32* %a
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
%tmp4 = phi i32 [ %tmp3, %for.body ]
ret i32 %tmp4
}
; CHECK-LABEL: inv_val_store_to_inv_address(
; CHECK-LABEL: vector.body:
; CHECK: store i32 %ntrunc, i32* %a
; CHECK: store <4 x i32>
; CHECK-NEXT: %index.next = add i64 %index, 4
; CHECK-NEXT: icmp eq i64 %index.next, %n.vec
; CHECK-NEXT: br i1
define void @inv_val_store_to_inv_address(i32* %a, i64 %n, i32* %b) {
entry:
%ntrunc = trunc i64 %n to i32
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
store i32 %ntrunc, i32* %a
store i32 %ntrunc, i32* %tmp1
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
; Both of these tests below are handled as predicated stores.
; Conditional store
; if (b[i] == k) a = ntrunc
; TODO: We can be better with the code gen for the first test and we can have
; just one scalar store if vector.or.reduce(vector_cmp(b[i] == k)) is 1.
; CHECK-LABEL:inv_val_store_to_inv_address_conditional(
; CHECK-LABEL: vector.body:
; CHECK: %wide.load = load <4 x i32>, <4 x i32>*
; CHECK: [[CMP:%[a-zA-Z0-9.]+]] = icmp eq <4 x i32> %wide.load, %{{.*}}
; CHECK: store <4 x i32>
; CHECK-NEXT: [[EE:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 0
; CHECK-NEXT: br i1 [[EE]], label %pred.store.if, label %pred.store.continue
; CHECK-LABEL: pred.store.if:
; CHECK-NEXT: store i32 %ntrunc, i32* %a
; CHECK-NEXT: br label %pred.store.continue
; CHECK-LABEL: pred.store.continue:
; CHECK-NEXT: [[EE1:%[a-zA-Z0-9.]+]] = extractelement <4 x i1> [[CMP]], i32 1
define void @inv_val_store_to_inv_address_conditional(i32* %a, i64 %n, i32* %b, i32 %k) {
entry:
%ntrunc = trunc i64 %n to i32
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
%cmp = icmp eq i32 %tmp2, %k
store i32 %ntrunc, i32* %tmp1
br i1 %cmp, label %cond_store, label %latch
cond_store:
store i32 %ntrunc, i32* %a
br label %latch
latch:
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
; if (b[i] == k)
; a = ntrunc
; else a = k;
; TODO: We could vectorize this once we support multiple uniform stores to the
; same address.
; CHECK-LABEL:inv_val_store_to_inv_address_conditional_diff_values(
; CHECK-NOT: load <4 x i32>
define void @inv_val_store_to_inv_address_conditional_diff_values(i32* %a, i64 %n, i32* %b, i32 %k) {
entry:
%ntrunc = trunc i64 %n to i32
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
%cmp = icmp eq i32 %tmp2, %k
store i32 %ntrunc, i32* %tmp1
br i1 %cmp, label %cond_store, label %cond_store_k
cond_store:
store i32 %ntrunc, i32* %a
br label %latch
cond_store_k:
store i32 %k, i32 * %a
br label %latch
latch:
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
; Instcombine'd version of above test. Now the store is no longer of invariant
; value.
; TODO: We should be able to vectorize this loop once we support vectorizing
; stores of variant values to invariant addresses.
; CHECK-LABEL: inv_val_store_to_inv_address_conditional_diff_values_ic
; CHECK-NOT: <4 x
define void @inv_val_store_to_inv_address_conditional_diff_values_ic(i32* %a, i64 %n, i32* %b, i32 %k) {
entry:
%ntrunc = trunc i64 %n to i32
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
%cmp = icmp eq i32 %tmp2, %k
store i32 %ntrunc, i32* %tmp1
br i1 %cmp, label %cond_store, label %cond_store_k
cond_store:
br label %latch
cond_store_k:
br label %latch
latch:
%storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
store i32 %storeval, i32* %a
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
; invariant val stored to invariant address predicated on invariant condition
; This is not treated as a predicated store since the block the store belongs to
; is the latch block (which doesn't need to be predicated).
; TODO: We should vectorize this loop once we relax the check for
; variant/invariant values being stored to invariant address.
; CHECK-LABEL: inv_val_store_to_inv_address_conditional_inv
; CHECK-NOT: <4 x
define void @inv_val_store_to_inv_address_conditional_inv(i32* %a, i64 %n, i32* %b, i32 %k) {
entry:
%ntrunc = trunc i64 %n to i32
%cmp = icmp eq i32 %ntrunc, %k
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %latch ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
store i32 %ntrunc, i32* %tmp1
br i1 %cmp, label %cond_store, label %cond_store_k
cond_store:
br label %latch
cond_store_k:
br label %latch
latch:
%storeval = phi i32 [ %ntrunc, %cond_store ], [ %k, %cond_store_k ]
store i32 %storeval, i32* %a
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
ret void
}
; TODO: This loop can be vectorized once we support variant value being
; stored into invariant address.
; CHECK-LABEL: variant_val_store_to_inv_address
; CHECK-NOT: <4 x i32>
define i32 @variant_val_store_to_inv_address(i32* %a, i64 %n, i32* %b, i32 %k) {
entry:
%ntrunc = trunc i64 %n to i32
%cmp = icmp eq i32 %ntrunc, %k
br label %for.body
for.body: ; preds = %for.body, %entry
%i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
%tmp0 = phi i32 [ %tmp3, %for.body ], [ 0, %entry ]
%tmp1 = getelementptr inbounds i32, i32* %b, i64 %i
%tmp2 = load i32, i32* %tmp1, align 8
store i32 %tmp2, i32* %a
%tmp3 = add i32 %tmp0, %tmp2
%i.next = add nuw nsw i64 %i, 1
%cond = icmp slt i64 %i.next, %n
br i1 %cond, label %for.body, label %for.end
for.end: ; preds = %for.body
%rdx.lcssa = phi i32 [ %tmp0, %for.body ]
ret i32 %rdx.lcssa
}