; llvm-project/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; RUN: opt < %s -vector-combine -S -mtriple=x86_64-- | FileCheck %s
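
; These tests exercise VectorCombine's extract+extract+binop fold: a scalar
; binop of two extracted elements becomes a vector binop followed by a single
; extract when the cost model says the vector form is no more expensive.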

; Eliminating extract is profitable.
define i8 @ext0_ext0_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext0_add(
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT: ret i8 [[TMP2]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 0
%r = add i8 %e0, %e1
ret i8 %r
}

; Eliminating extract is still profitable. The nuw/nsw flags propagate.
define i8 @ext1_ext1_add_flags(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_flags(
; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw <16 x i8> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1
; CHECK-NEXT: ret i8 [[TMP2]]
;
%e0 = extractelement <16 x i8> %x, i32 1
%e1 = extractelement <16 x i8> %y, i32 1
%r = add nsw nuw i8 %e0, %e1
ret i8 %r
}

; Negative test - eliminating extract is profitable, but vector shift is expensive.
define i8 @ext1_ext1_shl(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_shl(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 1
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
; CHECK-NEXT: [[R:%.*]] = shl i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 1
%e1 = extractelement <16 x i8> %y, i32 1
%r = shl i8 %e0, %e1
ret i8 %r
}

; Negative test - eliminating extract is profitable, but vector multiply is expensive.
define i8 @ext13_ext13_mul(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext13_ext13_mul(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 13
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 13
; CHECK-NEXT: [[R:%.*]] = mul i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 13
%e1 = extractelement <16 x i8> %y, i32 13
%r = mul i8 %e0, %e1
ret i8 %r
}

; Negative test - cost is irrelevant because sdiv has potential UB: speculating
; the divide across all vector lanes could divide by zero in lanes the scalar
; code never touched.
define i8 @ext0_ext0_sdiv(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext0_sdiv(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; CHECK-NEXT: [[R:%.*]] = sdiv i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 0
%r = sdiv i8 %e0, %e1
ret i8 %r
}

; Extracts are free and the vector op costs the same as the scalar op, but we
; speculatively transform to a vector op to create more optimization
; opportunities.
define double @ext0_ext0_fadd(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext0_ext0_fadd(
; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 0
; CHECK-NEXT: ret double [[TMP2]]
;
%e0 = extractelement <2 x double> %x, i32 0
%e1 = extractelement <2 x double> %y, i32 0
%r = fadd double %e0, %e1
ret double %r
}

; Eliminating extract is profitable. Fast-math flags propagate.
define double @ext1_ext1_fsub(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_ext1_fsub(
; CHECK-NEXT: [[TMP1:%.*]] = fsub fast <2 x double> [[X:%.*]], [[Y:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x double> [[TMP1]], i32 1
; CHECK-NEXT: ret double [[TMP2]]
;
%e0 = extractelement <2 x double> %x, i32 1
%e1 = extractelement <2 x double> %y, i32 1
%r = fsub fast double %e0, %e1
ret double %r
}

; Negative test - type mismatch.
define double @ext1_ext1_fadd_different_types(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_ext1_fadd_different_types(
; CHECK-NEXT: [[E0:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT: [[E1:%.*]] = extractelement <4 x double> [[Y:%.*]], i32 1
; CHECK-NEXT: [[R:%.*]] = fadd fast double [[E0]], [[E1]]
; CHECK-NEXT: ret double [[R]]
;
%e0 = extractelement <2 x double> %x, i32 1
%e1 = extractelement <4 x double> %y, i32 1
%r = fadd fast double %e0, %e1
ret double %r
}

; Disguised same vector operand; scalar code is not cheaper (with default
; x86 target), so aggressively form vector binop.
define i32 @ext1_ext1_add_same_vec(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: ret i32 [[TMP2]]
;
%e0 = extractelement <4 x i32> %x, i32 1
%e1 = extractelement <4 x i32> %x, i32 1
%r = add i32 %e0, %e1
ret i32 %r
}

; Functionally equivalent to above test; should transform as above.
define i32 @ext1_ext1_add_same_vec_cse(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: ret i32 [[TMP2]]
;
%e0 = extractelement <4 x i32> %x, i32 1
%r = add i32 %e0, %e0
ret i32 %r
}

; Don't assert if the extract indices have different types (i32 vs. i64).
define i32 @ext1_ext1_add_same_vec_diff_idx_ty(<4 x i32> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_diff_idx_ty(
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[X:%.*]], [[X]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1
; CHECK-NEXT: ret i32 [[TMP2]]
;
%e0 = extractelement <4 x i32> %x, i32 1
%e1 = extractelement <4 x i32> %x, i64 1
%r = add i32 %e0, %e1
ret i32 %r
}

declare void @use_i8(i8)

; Negative test - same vector operand; scalar code is cheaper than general case
; and vector code would be more expensive still.
define i8 @ext1_ext1_add_same_vec_extra_use0(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use0(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: call void @use_i8(i8 [[E0]])
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0
; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
call void @use_i8(i8 %e0)
%e1 = extractelement <16 x i8> %x, i32 0
%r = add i8 %e0, %e1
ret i8 %r
}

; Negative test - same vector operand; scalar code is cheaper than general case
; and vector code would be more expensive still.
define i8 @ext1_ext1_add_same_vec_extra_use1(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_extra_use1(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[X]], i32 0
; CHECK-NEXT: call void @use_i8(i8 [[E1]])
; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %x, i32 0
call void @use_i8(i8 %e1)
%r = add i8 %e0, %e1
ret i8 %r
}

; Negative test - same vector operand; scalar code is cheaper than general case
; and vector code would be more expensive still.
define i8 @ext1_ext1_add_same_vec_cse_extra_use(<16 x i8> %x) {
; CHECK-LABEL: @ext1_ext1_add_same_vec_cse_extra_use(
; CHECK-NEXT: [[E:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: call void @use_i8(i8 [[E]])
; CHECK-NEXT: [[R:%.*]] = add i8 [[E]], [[E]]
; CHECK-NEXT: ret i8 [[R]]
;
%e = extractelement <16 x i8> %x, i32 0
call void @use_i8(i8 %e)
%r = add i8 %e, %e
ret i8 %r
}

; Vector code costs the same as scalar, so aggressively form vector op.
define i8 @ext1_ext1_add_uses1(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_uses1(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: call void @use_i8(i8 [[E0]])
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X]], [[Y:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT: ret i8 [[TMP2]]
;
%e0 = extractelement <16 x i8> %x, i32 0
call void @use_i8(i8 %e0)
%e1 = extractelement <16 x i8> %y, i32 0
%r = add i8 %e0, %e1
ret i8 %r
}

; Vector code costs the same as scalar, so aggressively form vector op.
define i8 @ext1_ext1_add_uses2(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext1_ext1_add_uses2(
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; CHECK-NEXT: call void @use_i8(i8 [[E1]])
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i8> [[X:%.*]], [[Y]]
; CHECK-NEXT: [[TMP2:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0
; CHECK-NEXT: ret i8 [[TMP2]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 0
call void @use_i8(i8 %e1)
%r = add i8 %e0, %e1
ret i8 %r
}

; TODO: Different extract indexes require a shuffle.
define i8 @ext0_ext1_add(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext0_ext1_add(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 0
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 1
; CHECK-NEXT: [[R:%.*]] = add i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 0
%e1 = extractelement <16 x i8> %y, i32 1
%r = add i8 %e0, %e1
ret i8 %r
}
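
; Same different-index TODO case as above, with indexes 5 and 0 and a sub.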
define i8 @ext5_ext0_sub(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext5_ext0_sub(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 0
; CHECK-NEXT: [[R:%.*]] = sub i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 0
%r = sub i8 %e0, %e1
ret i8 %r
}
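
; Same different-index TODO case as above, with indexes 5 and 6 and an and.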
define i8 @ext5_ext6_and(<16 x i8> %x, <16 x i8> %y) {
; CHECK-LABEL: @ext5_ext6_and(
; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x i8> [[X:%.*]], i32 5
; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x i8> [[Y:%.*]], i32 6
; CHECK-NEXT: [[R:%.*]] = and i8 [[E0]], [[E1]]
; CHECK-NEXT: ret i8 [[R]]
;
%e0 = extractelement <16 x i8> %x, i32 5
%e1 = extractelement <16 x i8> %y, i32 6
%r = and i8 %e0, %e1
ret i8 %r
}