Commit Graph

22902 Commits

Simon Pilgrim 4204361fed [X86] X86InstrInfo.cpp - fix signed/unsigned promotion warnings in addImm calls
addImm takes an int64_t arg but we were using uint64_t types
2022-06-15 18:21:43 +01:00
Paul Robinson 654a835c3f [PS5] Trap after noreturn calls, with special case for stack-check-fail 2022-06-15 09:02:17 -07:00
Phoebe Wang e1c5afa47d Reland "Reland "[X86][RFC] Enable `_Float16` type support on X86 following the psABI""
Fixed the missing SQRT promotion. Added several missing operations too.
2022-06-15 23:00:18 +08:00
Thomas Joerg 37455b1f71 Revert "Reland "[X86][RFC] Enable `_Float16` type support on X86 following the psABI""
This reverts commit 6e02e27536.

This introduces a crash in the backend. Reproducer in MLIR's LLVM
dialect follows. Let me know if you have trouble reproducing this.

module {
  llvm.func @malloc(i64) -> !llvm.ptr<i8>
  llvm.func @_mlir_ciface_tf_report_error(!llvm.ptr<i8>, i32, !llvm.ptr<i8>)
  llvm.mlir.global internal constant @error_message_2208944672953921889("failed to allocate memory at loc(\22-\22:3:8)\00")
  llvm.func @_mlir_ciface_tf_alloc(!llvm.ptr<i8>, i64, i64, i32, i32, !llvm.ptr<i32>) -> !llvm.ptr<i8>
  llvm.func @Rsqrt_CPU_DT_HALF_DT_HALF(%arg0: !llvm.ptr<i8>, %arg1: i64, %arg2: !llvm.ptr<i8>) -> !llvm.struct<(i64, ptr<i8>)> attributes {llvm.emit_c_interface, tf_entry} {
    %0 = llvm.mlir.constant(8 : i32) : i32
    %1 = llvm.mlir.constant(8 : index) : i64
    %2 = llvm.mlir.constant(2 : index) : i64
    %3 = llvm.mlir.constant(dense<0.000000e+00> : vector<4xf16>) : vector<4xf16>
    %4 = llvm.mlir.constant(dense<[0, 1, 2, 3]> : vector<4xi32>) : vector<4xi32>
    %5 = llvm.mlir.constant(dense<1.000000e+00> : vector<4xf16>) : vector<4xf16>
    %6 = llvm.mlir.constant(false) : i1
    %7 = llvm.mlir.constant(1 : i32) : i32
    %8 = llvm.mlir.constant(0 : i32) : i32
    %9 = llvm.mlir.constant(4 : index) : i64
    %10 = llvm.mlir.constant(0 : index) : i64
    %11 = llvm.mlir.constant(1 : index) : i64
    %12 = llvm.mlir.constant(-1 : index) : i64
    %13 = llvm.mlir.null : !llvm.ptr<f16>
    %14 = llvm.getelementptr %13[%9] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %15 = llvm.ptrtoint %14 : !llvm.ptr<f16> to i64
    %16 = llvm.alloca %15 x f16 {alignment = 32 : i64} : (i64) -> !llvm.ptr<f16>
    %17 = llvm.alloca %15 x f16 {alignment = 32 : i64} : (i64) -> !llvm.ptr<f16>
    %18 = llvm.mlir.null : !llvm.ptr<i64>
    %19 = llvm.getelementptr %18[%arg1] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %20 = llvm.ptrtoint %19 : !llvm.ptr<i64> to i64
    %21 = llvm.alloca %20 x i64 : (i64) -> !llvm.ptr<i64>
    llvm.br ^bb1(%10 : i64)
  ^bb1(%22: i64):  // 2 preds: ^bb0, ^bb2
    %23 = llvm.icmp "slt" %22, %arg1 : i64
    llvm.cond_br %23, ^bb2, ^bb3
  ^bb2:  // pred: ^bb1
    %24 = llvm.bitcast %arg2 : !llvm.ptr<i8> to !llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>
    %25 = llvm.getelementptr %24[%10, 2] : (!llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>, i64) -> !llvm.ptr<i64>
    %26 = llvm.add %22, %11  : i64
    %27 = llvm.getelementptr %25[%26] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %28 = llvm.load %27 : !llvm.ptr<i64>
    %29 = llvm.getelementptr %21[%22] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    llvm.store %28, %29 : !llvm.ptr<i64>
    llvm.br ^bb1(%26 : i64)
  ^bb3:  // pred: ^bb1
    llvm.br ^bb4(%10, %11 : i64, i64)
  ^bb4(%30: i64, %31: i64):  // 2 preds: ^bb3, ^bb5
    %32 = llvm.icmp "slt" %30, %arg1 : i64
    llvm.cond_br %32, ^bb5, ^bb6
  ^bb5:  // pred: ^bb4
    %33 = llvm.bitcast %arg2 : !llvm.ptr<i8> to !llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>
    %34 = llvm.getelementptr %33[%10, 2] : (!llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64)>>, i64) -> !llvm.ptr<i64>
    %35 = llvm.add %30, %11  : i64
    %36 = llvm.getelementptr %34[%35] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %37 = llvm.load %36 : !llvm.ptr<i64>
    %38 = llvm.mul %37, %31  : i64
    llvm.br ^bb4(%35, %38 : i64, i64)
  ^bb6:  // pred: ^bb4
    %39 = llvm.bitcast %arg2 : !llvm.ptr<i8> to !llvm.ptr<ptr<f16>>
    %40 = llvm.getelementptr %39[%11] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    %41 = llvm.load %40 : !llvm.ptr<ptr<f16>>
    %42 = llvm.getelementptr %13[%11] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %43 = llvm.ptrtoint %42 : !llvm.ptr<f16> to i64
    %44 = llvm.alloca %7 x i32 : (i32) -> !llvm.ptr<i32>
    llvm.store %8, %44 : !llvm.ptr<i32>
    %45 = llvm.call @_mlir_ciface_tf_alloc(%arg0, %31, %43, %8, %7, %44) : (!llvm.ptr<i8>, i64, i64, i32, i32, !llvm.ptr<i32>) -> !llvm.ptr<i8>
    %46 = llvm.bitcast %45 : !llvm.ptr<i8> to !llvm.ptr<f16>
    %47 = llvm.icmp "eq" %31, %10 : i64
    %48 = llvm.or %6, %47  : i1
    %49 = llvm.mlir.null : !llvm.ptr<i8>
    %50 = llvm.icmp "ne" %45, %49 : !llvm.ptr<i8>
    %51 = llvm.or %50, %48  : i1
    llvm.cond_br %51, ^bb7, ^bb13
  ^bb7:  // pred: ^bb6
    %52 = llvm.urem %31, %9  : i64
    %53 = llvm.sub %31, %52  : i64
    llvm.br ^bb8(%10 : i64)
  ^bb8(%54: i64):  // 2 preds: ^bb7, ^bb9
    %55 = llvm.icmp "slt" %54, %53 : i64
    llvm.cond_br %55, ^bb9, ^bb10
  ^bb9:  // pred: ^bb8
    %56 = llvm.mul %54, %11  : i64
    %57 = llvm.add %56, %10  : i64
    %58 = llvm.add %57, %10  : i64
    %59 = llvm.getelementptr %41[%58] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %60 = llvm.bitcast %59 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    %61 = llvm.load %60 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %62 = "llvm.intr.sqrt"(%61) : (vector<4xf16>) -> vector<4xf16>
    %63 = llvm.fdiv %5, %62  : vector<4xf16>
    %64 = llvm.getelementptr %46[%58] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %65 = llvm.bitcast %64 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.store %63, %65 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %66 = llvm.add %54, %9  : i64
    llvm.br ^bb8(%66 : i64)
  ^bb10:  // pred: ^bb8
    %67 = llvm.icmp "ult" %53, %31 : i64
    llvm.cond_br %67, ^bb11, ^bb12
  ^bb11:  // pred: ^bb10
    %68 = llvm.mul %53, %12  : i64
    %69 = llvm.add %31, %68  : i64
    %70 = llvm.mul %53, %11  : i64
    %71 = llvm.add %70, %10  : i64
    %72 = llvm.trunc %69 : i64 to i32
    %73 = llvm.mlir.undef : vector<4xi32>
    %74 = llvm.insertelement %72, %73[%8 : i32] : vector<4xi32>
    %75 = llvm.shufflevector %74, %73 [0 : i32, 0 : i32, 0 : i32, 0 : i32] : vector<4xi32>, vector<4xi32>
    %76 = llvm.icmp "slt" %4, %75 : vector<4xi32>
    %77 = llvm.add %71, %10  : i64
    %78 = llvm.getelementptr %41[%77] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %79 = llvm.bitcast %78 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    %80 = llvm.intr.masked.load %79, %76, %3 {alignment = 2 : i32} : (!llvm.ptr<vector<4xf16>>, vector<4xi1>, vector<4xf16>) -> vector<4xf16>
    %81 = llvm.bitcast %16 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.store %80, %81 : !llvm.ptr<vector<4xf16>>
    %82 = llvm.load %81 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %83 = "llvm.intr.sqrt"(%82) : (vector<4xf16>) -> vector<4xf16>
    %84 = llvm.fdiv %5, %83  : vector<4xf16>
    %85 = llvm.bitcast %17 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.store %84, %85 {alignment = 2 : i64} : !llvm.ptr<vector<4xf16>>
    %86 = llvm.load %85 : !llvm.ptr<vector<4xf16>>
    %87 = llvm.getelementptr %46[%77] : (!llvm.ptr<f16>, i64) -> !llvm.ptr<f16>
    %88 = llvm.bitcast %87 : !llvm.ptr<f16> to !llvm.ptr<vector<4xf16>>
    llvm.intr.masked.store %86, %88, %76 {alignment = 2 : i32} : vector<4xf16>, vector<4xi1> into !llvm.ptr<vector<4xf16>>
    llvm.br ^bb12
  ^bb12:  // 2 preds: ^bb10, ^bb11
    %89 = llvm.mul %2, %1  : i64
    %90 = llvm.mul %arg1, %2  : i64
    %91 = llvm.add %90, %11  : i64
    %92 = llvm.mul %91, %1  : i64
    %93 = llvm.add %89, %92  : i64
    %94 = llvm.alloca %93 x i8 : (i64) -> !llvm.ptr<i8>
    %95 = llvm.bitcast %94 : !llvm.ptr<i8> to !llvm.ptr<ptr<f16>>
    llvm.store %46, %95 : !llvm.ptr<ptr<f16>>
    %96 = llvm.getelementptr %95[%11] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    llvm.store %46, %96 : !llvm.ptr<ptr<f16>>
    %97 = llvm.getelementptr %95[%2] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    %98 = llvm.bitcast %97 : !llvm.ptr<ptr<f16>> to !llvm.ptr<i64>
    llvm.store %10, %98 : !llvm.ptr<i64>
    %99 = llvm.bitcast %94 : !llvm.ptr<i8> to !llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64, i64)>>
    %100 = llvm.getelementptr %99[%10, 3] : (!llvm.ptr<struct<(ptr<f16>, ptr<f16>, i64, i64)>>, i64) -> !llvm.ptr<i64>
    %101 = llvm.getelementptr %100[%arg1] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %102 = llvm.sub %arg1, %11  : i64
    llvm.br ^bb14(%102, %11 : i64, i64)
  ^bb13:  // pred: ^bb6
    %103 = llvm.mlir.addressof @error_message_2208944672953921889 : !llvm.ptr<array<42 x i8>>
    %104 = llvm.getelementptr %103[%10, %10] : (!llvm.ptr<array<42 x i8>>, i64, i64) -> !llvm.ptr<i8>
    llvm.call @_mlir_ciface_tf_report_error(%arg0, %0, %104) : (!llvm.ptr<i8>, i32, !llvm.ptr<i8>) -> ()
    %105 = llvm.mul %2, %1  : i64
    %106 = llvm.mul %2, %10  : i64
    %107 = llvm.add %106, %11  : i64
    %108 = llvm.mul %107, %1  : i64
    %109 = llvm.add %105, %108  : i64
    %110 = llvm.alloca %109 x i8 : (i64) -> !llvm.ptr<i8>
    %111 = llvm.bitcast %110 : !llvm.ptr<i8> to !llvm.ptr<ptr<f16>>
    llvm.store %13, %111 : !llvm.ptr<ptr<f16>>
    %112 = llvm.getelementptr %111[%11] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    llvm.store %13, %112 : !llvm.ptr<ptr<f16>>
    %113 = llvm.getelementptr %111[%2] : (!llvm.ptr<ptr<f16>>, i64) -> !llvm.ptr<ptr<f16>>
    %114 = llvm.bitcast %113 : !llvm.ptr<ptr<f16>> to !llvm.ptr<i64>
    llvm.store %10, %114 : !llvm.ptr<i64>
    %115 = llvm.call @malloc(%109) : (i64) -> !llvm.ptr<i8>
    "llvm.intr.memcpy"(%115, %110, %109, %6) : (!llvm.ptr<i8>, !llvm.ptr<i8>, i64, i1) -> ()
    %116 = llvm.mlir.undef : !llvm.struct<(i64, ptr<i8>)>
    %117 = llvm.insertvalue %10, %116[0] : !llvm.struct<(i64, ptr<i8>)>
    %118 = llvm.insertvalue %115, %117[1] : !llvm.struct<(i64, ptr<i8>)>
    llvm.return %118 : !llvm.struct<(i64, ptr<i8>)>
  ^bb14(%119: i64, %120: i64):  // 2 preds: ^bb12, ^bb15
    %121 = llvm.icmp "sge" %119, %10 : i64
    llvm.cond_br %121, ^bb15, ^bb16
  ^bb15:  // pred: ^bb14
    %122 = llvm.getelementptr %21[%119] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    %123 = llvm.load %122 : !llvm.ptr<i64>
    %124 = llvm.getelementptr %100[%119] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    llvm.store %123, %124 : !llvm.ptr<i64>
    %125 = llvm.getelementptr %101[%119] : (!llvm.ptr<i64>, i64) -> !llvm.ptr<i64>
    llvm.store %120, %125 : !llvm.ptr<i64>
    %126 = llvm.mul %120, %123  : i64
    %127 = llvm.sub %119, %11  : i64
    llvm.br ^bb14(%127, %126 : i64, i64)
  ^bb16:  // pred: ^bb14
    %128 = llvm.call @malloc(%93) : (i64) -> !llvm.ptr<i8>
    "llvm.intr.memcpy"(%128, %94, %93, %6) : (!llvm.ptr<i8>, !llvm.ptr<i8>, i64, i1) -> ()
    %129 = llvm.mlir.undef : !llvm.struct<(i64, ptr<i8>)>
    %130 = llvm.insertvalue %arg1, %129[0] : !llvm.struct<(i64, ptr<i8>)>
    %131 = llvm.insertvalue %128, %130[1] : !llvm.struct<(i64, ptr<i8>)>
    llvm.return %131 : !llvm.struct<(i64, ptr<i8>)>
  }
  llvm.func @_mlir_ciface_Rsqrt_CPU_DT_HALF_DT_HALF(%arg0: !llvm.ptr<struct<(i64, ptr<i8>)>>, %arg1: !llvm.ptr<i8>, %arg2: !llvm.ptr<struct<(i64, ptr<i8>)>>) attributes {llvm.emit_c_interface, tf_entry} {
    %0 = llvm.load %arg2 : !llvm.ptr<struct<(i64, ptr<i8>)>>
    %1 = llvm.extractvalue %0[0] : !llvm.struct<(i64, ptr<i8>)>
    %2 = llvm.extractvalue %0[1] : !llvm.struct<(i64, ptr<i8>)>
    %3 = llvm.call @Rsqrt_CPU_DT_HALF_DT_HALF(%arg1, %1, %2) : (!llvm.ptr<i8>, i64, !llvm.ptr<i8>) -> !llvm.struct<(i64, ptr<i8>)>
    llvm.store %3, %arg0 : !llvm.ptr<struct<(i64, ptr<i8>)>>
    llvm.return
  }
}
2022-06-15 13:24:24 +02:00
Simon Pilgrim cf2072bcad [X86] X86TargetTransformInfo.cpp - use InstructionCost type to accumulate instructions costs 2022-06-15 12:21:01 +01:00
Benjamin Kramer fb34d531af Promote bf16 to f32 when the target doesn't support it
This is modeled after the half-precision fp support. Two new nodes are
introduced for casting from and to bf16. Since casting from bf16 is a
simple operation I opted to always directly lower it to integer
arithmetic. The other way round is more complicated if you want to
preserve IEEE semantics, so it's handled by a new __truncsfbf2
compiler-rt builtin.

This is of course very bare bones, but sufficient to get a semi-softened
fadd on x86.

Possible future improvements:
 - Targets with bf16 conversion instructions can now make fp_to_bf16 legal
 - The software conversion to bf16 can be replaced by a trivial
   implementation under fast math.

Differential Revision: https://reviews.llvm.org/D126953
2022-06-15 12:56:31 +02:00
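
To illustrate the two conversions described in the change above, here is a hedged sketch in plain C++ (rounding and NaN handling are simplified; compiler-rt's actual __truncsfbf2 may differ):

  #include <cstdint>
  #include <cstring>

  // bf16 -> float is cheap: the 16 bf16 bits are the top half of a float.
  float bf16_to_float(uint16_t B) {
    uint32_t Bits = static_cast<uint32_t>(B) << 16;
    float F;
    std::memcpy(&F, &Bits, sizeof(F));
    return F;
  }

  // float -> bf16 with round-to-nearest-even; illustrative only.
  uint16_t float_to_bf16(float F) {
    uint32_t Bits;
    std::memcpy(&Bits, &F, sizeof(Bits));
    uint32_t Rounding = 0x7FFF + ((Bits >> 16) & 1);
    return static_cast<uint16_t>((Bits + Rounding) >> 16);
  }
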
Simon Pilgrim 4fd561415e [X86] needCarryOrOverflowFlag/onlyZeroFlagUsed - merge identical switch cases. NFCI.
Makes it easier to grok and fixes various bugprone-branch-clone warnings.
2022-06-15 10:40:22 +01:00
Amir Ayupov 5965878d4d [X86][NFC] Use mnemonic tables in validateInstruction 4/4
Group switch cases by opcode:
- VGATHERDPD
- VGATHERDPS
- VGATHERQPD
- VGATHERQPS
- VPGATHERDD
- VPGATHERDQ
- VPGATHERQD
- VPGATHERQQ

Distinguish masked vs non-masked forms by EVEX encoding.

Reviewed By: skan, craig.topper

Differential Revision: https://reviews.llvm.org/D127719
2022-06-14 19:53:44 -07:00
Luo, Yuanke 54ec8e25fc [X86][AMX] Fix Klocwork issue. 2022-06-15 09:26:59 +08:00
Phoebe Wang 6e02e27536 Reland "[X86][RFC] Enable `_Float16` type support on X86 following the psABI"
Disabled 2 MLIR tests because the runtime doesn't support `_Float16`; see
the issue here: https://github.com/llvm/llvm-project/issues/55992
2022-06-15 09:15:31 +08:00
Amir Ayupov 6226e46c5f [X86][NFC] Use mnemonic tables in validateInstruction 3/4
Group switch cases by opcode:
- V4FMADDPS
- V4FMADDSS
- V4FNMADDPS
- V4FNMADDSS
- VP4DPWSSDS
- VP4DPWSSD

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D127718
2022-06-14 12:11:47 -07:00
Amir Ayupov df16c077dc [X86][NFC] Use mnemonic tables in validateInstruction 2/4
Group switch cases by opcode:
- VFCMULCPH
- VFCMULCSH
- VFMULCPH
- VFMULCSH

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D127717
2022-06-14 12:09:37 -07:00
Amir Ayupov 4bf928bce4 [X86][NFC] Use mnemonic tables in validateInstruction 1/4
Group switch cases by opcode:
- VFCMADDCPH
- VFCMADDCSH
- VFMADDCPH
- VFMADDCSH

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D127716
2022-06-14 12:06:23 -07:00
Simon Pilgrim 64eea34420 [X86] combineEXTEND_VECTOR_INREG - don't attempt to shuffle combine ANY_EXTEND_VECTOR_INREG without SSE41
Without SSE41, ANY_EXTEND_VECTOR_INREG nodes are likely to be prematurely combined to a target shuffle preventing generic sign extension folds.

Fixes a number of sign-extend regressions in D127115.
2022-06-13 17:42:04 +01:00
Maksim Panchenko 8f6512fea0 [X86][Disassembler] Fix displacement operand size for symbolizer
On 64-bit X86, the 0x66 operand-size override prefix changes the size of
the instruction operand, e.g. from 32 bits to 16 bits, but it does not
modify the size of the displacement operand used for memory addressing,
which is always 32 bits.

Reviewed By: skan, rafauler

Differential Revision: https://reviews.llvm.org/D126726
2022-06-13 00:14:43 -07:00
Kazu Hirata 92ab024f81 [X86] Use default member initialization (NFC)
Identified with modernize-use-default-member-init.
2022-06-12 18:30:46 -07:00
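
For illustration, the pattern the modernize-use-default-member-init check favors (example is mine, not taken from the patch):

  struct ShuffleState {
    unsigned NumLanes = 1;   // default member initializers...
    bool Commuted = false;   // ...instead of repeating them in each constructor
  };
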
Jez Ng d4bcb45db7 [MC][re-land] Omit DWARF unwind info if compact unwind is present where eligible
This reverts commit d941d59783.

Differential Revision: https://reviews.llvm.org/D122258
2022-06-12 17:24:19 -04:00
Mehdi Amini 5d8298a768 Revert "[X86][RFC] Enable `_Float16` type support on X86 following the psABI"
This reverts commit 2d2da259c8.

This breaks MLIR integration test (JIT crashing), reverting in the
meantime.
2022-06-12 15:14:37 +00:00
Jez Ng d941d59783 Revert "[MC] Omit DWARF unwind info if compact unwind is present where eligible"
This reverts commit ef501bf85d.
2022-06-12 10:47:08 -04:00
Simon Pilgrim b5d7beeb97 [X86] combineConcatVectorOps - add support for concatenation of VSELECT/BLENDV nodes (REAPPLIED)
If the LHS/RHS selection operands can be cheaply concatenated back together then replace 2 x 128-bit selection nodes with 1 x 256-bit node

Addresses the regression introduced in the bug fix from rGd5af6a38082b39ae520a328e44dc29ebcb036bb2

REAPPLIED with fix for the bug identified in rGea8fb3b60196
2022-06-12 15:40:36 +01:00
Jez Ng ef501bf85d [MC] Omit DWARF unwind info if compact unwind is present where eligible
Previously, omitting unnecessary DWARF unwinds was only done in two
cases:
* For Darwin + aarch64, if no DWARF unwind info is needed for all the
  functions in a TU, then the `__eh_frame` section would be omitted
  entirely. If any one function needed DWARF unwind, then MC would emit
  DWARF unwind entries for all the functions in the TU.
* For watchOS, MC would omit DWARF unwind on a per-function basis, as
  long as compact unwind was available for that function.

This diff makes it so that we omit DWARF unwind on a per-function basis
for Darwin + aarch64 as well. In addition, we introduce the flag
`--emit-dwarf-unwind=` which can toggle between `always`,
`no-compact-unwind` (only emit DWARF when CU cannot be emitted for a
given function), and the target platform `default`.  `no-compact-unwind`
is particularly useful for newer x86_64 platforms: we don't want to omit
DWARF unwind for x86_64 in general due to possible backwards compat
issues, but we should make it possible for people to opt into this
behavior if they are only targeting newer platforms.

**Motivation:** I'm working on adding support for `__eh_frame` to LLD,
but I'm concerned that we would suffer a perf hit. Processing compact
unwind is already expensive, and that's a simpler format than EH frames.
Given that MC currently produces one EH frame entry for every compact
unwind entry, I don't think processing them will be cheap. I tried to do
something clever on LLD's end to drop the unnecessary EH frames at parse
time, but this made the code significantly more complex. So I'm looking
at fixing this at the MC level instead.

**Addendum:** It turns out that there was a latent bug in the X86
backend when `OmitDwarfIfHaveCompactUnwind` is naively enabled, which is
not too surprising given that this combination has not been heretofore
used.

For functions that have unwind info that cannot be encoded with CU, MC
would end up dropping both the compact unwind entry (OK; existing
behavior) as well as the DWARF entries (not OK).  This diff fixes things
so that we emit the DWARF entry, as well as a CU entry with encoding
`UNWIND_X86_MODE_DWARF` -- this basically tells the unwinder to look for
the DWARF entry. I'm not 100% sure the `UNWIND_X86_MODE_DWARF` CU entry
is necessary, this was the simplest fix. ld64 seems to be able to handle
both the absence and presence of this CU entry. Ultimately ld64 (and
LLD) will synthesize `UNWIND_X86_MODE_DWARF` if it is absent, so there
is no impact to the final binary size.

Reviewed By: davide, lhames

Differential Revision: https://reviews.llvm.org/D122258
2022-06-12 10:03:56 -04:00
Phoebe Wang 2d2da259c8 [X86][RFC] Enable `_Float16` type support on X86 following the psABI
GCC and Clang/LLVM will support `_Float16` on X86 in C/C++, following
the latest X86 psABI. (https://gitlab.com/x86-psABIs)

_Float16 arithmetic will be performed using native half-precision. If
native arithmetic instructions are not available, it will be performed
at a higher precision (currently always float) and then truncated down
to _Float16 immediately after each single arithmetic operation.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D107082
2022-06-12 11:40:00 +08:00
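
A hedged illustration of the semantics described above, assuming the compiler supports `_Float16` but no native half-precision arithmetic is available:

  // Each operation behaves as if performed in float and truncated back to
  // _Float16 immediately afterwards, e.g. t = (_Float16)((float)a * (float)b).
  _Float16 mul_add(_Float16 a, _Float16 b, _Float16 c) {
    _Float16 t = a * b; // promoted to float, truncated to _Float16
    return t + c;       // promoted to float, truncated to _Float16
  }
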
Simon Pilgrim 7841d09449 [X86][AVX512] Retain pmuldq broadcast loads on 32-bit targets
Don't demand just the lower 32-bits on 32-bit AVX512 targets to preserve 64-bit broadcast loads patterns
2022-06-11 19:30:00 +01:00
Simon Pilgrim 6eaea225c7 [X86] combineTargetShuffle - break if-else chain. NFC.
(style) Both cases always continue.
2022-06-11 09:16:39 +01:00
Simon Pilgrim 89d2b1e4f7 [X86] emitOrXorXorTree - break if-else chain. NFC.
(style) Both cases always return.
2022-06-11 09:16:38 +01:00
Fangrui Song adf4142f76 [MC] De-capitalize SwitchSection. NFC
Add SwitchSection to return switchSection. The API will be removed soon.
2022-06-10 22:50:55 -07:00
Eli Friedman 0ff51d5dde Fix interaction of CFI instructions with MachineOutliner.
1. When checking if a candidate contains a CFI instruction, actually
iterate over all of the instructions, instead of stopping halfway
through.
2. Make sure copied CFI directives refer to the correct instruction.

Fixes https://github.com/llvm/llvm-project/issues/55842

Differential Revision: https://reviews.llvm.org/D126930
2022-06-10 13:37:49 -07:00
Guillaume Chatelet 38637ee477 [clang] Add support for __builtin_memset_inline
In the same spirit as D73543 and in reply to https://reviews.llvm.org/D126768#3549920, this patch adds support for `__builtin_memset_inline`.

The idea is to get support from the compiler to easily write efficient memory function implementations.

This patch could be split in two:
 - one for the LLVM part adding the `llvm.memset.inline.*` intrinsics.
 - and another one for the Clang part providing the intrinsic as a builtin.

Differential Revision: https://reviews.llvm.org/D126903
2022-06-10 13:13:59 +00:00
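
A minimal usage sketch of the new builtin described above; as I understand the review, the size argument must be a compile-time constant so the expansion can be fully inlined:

  // Clear a fixed-size buffer without ever emitting a call to memset.
  void clear_key(unsigned char *Buf) {
    __builtin_memset_inline(Buf, 0, 32); // size must be a constant expression
  }
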
Simon Pilgrim 5acbb2dda2 [X86] combineMulToPMADDWD - don't bitcast the source ops before splitting to ensure we split the build vectors early
Fixes a regression on D127115 - splitting was creating extract_subvector(bitcast(build_vector())) patterns which prevented the build vectors being split before being bitcast to vXi16 types, resulting in various issues with further folding of the (now legal) build vectors
2022-06-10 13:44:49 +01:00
Simon Pilgrim 7ac33b8aac [X86] Remove !VT.is128BitVector() check. NFCI.
The code is inside an if(VT.is256BitVector() || VT.is512BitVector()) condition
2022-06-09 21:39:45 +01:00
Simon Pilgrim 72a049d778 [X86][AVX2] LowerINSERT_VECTOR_ELT - support v4i64 insertion as BLENDI(X, SCALAR_TO_VECTOR(Y)) 2022-06-09 21:18:10 +01:00
Simon Pilgrim 1a02db9882 [X86] canonicalizeShuffleWithBinOps - add TODO for X86ISD::ANDNP bitwise handling
Its just as safe to move shuffles across X86ISD::ANDNP as any other logical bitop, they just tend to appear too late to matter.

Noticed while triaging D127115 regressions.
2022-06-09 12:18:26 +01:00
Guillaume Chatelet dc3367970e [SelectionDAG] Handle bzero/memset libcalls globally instead of per target
Differential Revision: https://reviews.llvm.org/D127279
2022-06-09 08:34:55 +00:00
Simon Pilgrim 9a76337fee [X86] combineMOVMSK - constant fold with getTargetConstantBitsFromNode not just BUILD_VECTOR
Help avoid a regression in D127115
2022-06-08 17:48:55 +01:00
Matt Arsenault cc5a1b3dd9 llvm-reduce: Add cloning of target MachineFunctionInfo
MIR support is totally unusable for AMDGPU without this, since the set
of reserved registers is set from fields here.

Add a clone method to MachineFunctionInfo. This is a subtle variant of
the copy constructor that is required if there are any MIR constructs
that use pointers. Specifically, at minimum fields that reference
MachineBasicBlocks or the MachineFunction need to be adjusted to the
values in the new function.
2022-06-07 10:14:48 -04:00
Guillaume Chatelet 0788186182 [Alignment][NFC] Remove usage of MemSDNode::getAlignment
I can't remove the function just yet as it is used in the generated .inc files.
I would also like to provide a way to compare alignment with TypeSize since it came up a few times.

Differential Revision: https://reviews.llvm.org/D126910
2022-06-07 13:52:20 +00:00
Simon Pilgrim f5507978a3 [X86] getFauxShuffleMask - add VSELECT/BLENDV handling
First step towards enabling shuffle combining starting from VSELECT/BLENDV nodes - this should eventually help improve the codegen reported at Issue #54819
2022-06-07 14:46:25 +01:00
Simon Pilgrim 5cea1553b8 [X86] X86SpeculativeLoadHardening.cpp - pass DebugLoc by const reference not value. 2022-06-07 12:38:05 +01:00
Simon Pilgrim 1b6d3bdc82 [X86] foldMaskedMergeImpl - pass SDLoc by const reference not value. 2022-06-07 12:36:30 +01:00
Simon Pilgrim 63e3035dbe [X86] LowerGC_TRANSITION - remove redundant SDLoc(). 2022-06-07 10:57:58 +01:00
Fangrui Song 15d82c62dc [MC] De-capitalize MCStreamer functions
Follow-up to c031378ce0.
The class is mostly consistent now.
2022-06-07 00:31:02 -07:00
Shilei Tian 0c3e6e5717 [NFC] Remove trailing whitespace 2022-06-06 18:59:13 -04:00
Fangrui Song 77e300ffdf [MC] Change EndOfStatement "unexpected tokens in .xxx directive " to "expected newline" 2022-06-05 15:11:01 -07:00
Kazu Hirata 9a8e65de8c [Target] Use MachineBasicBlock::erase (NFC) 2022-06-04 22:41:24 -07:00
Eric Christopher 93cb6b9c83 Revert "[X86] combineConcatVectorOps - add support for concatenation VSELECT/BLENDV nodes"
See the original commit for a testcase.

This reverts commit ea8fb3b601.
2022-06-03 12:31:11 -07:00
Simon Pilgrim de2b543505 [X86] LowerVSETCC - merge getConstant() calls with flipped/unflipped sign masks. NFCI. 2022-06-01 15:09:48 +01:00
Sanjay Patel 3a503a4a9c [x86] fix miscompile from wrongly identified fneg
We may need to peek through a bitcast when identifying an fneg idiom
via its pool constant, but we can't allow a different-sized constant
in that match.

This is noted in issue #55758 with an example that needs fast-math,
but as the test here shows, this has potential to miscompile more
generally (no fast-math required).

Differential Revision: https://reviews.llvm.org/D126775
2022-06-01 09:56:33 -04:00
Simon Pilgrim f6dbb0b6fb [X86] Fix typo in extraction type introduced in rGed0303aa2251e4484a2b4ff7f236c9f7cdfb2092
It doesn't look like we have test coverage for this at the moment :(
2022-06-01 12:31:27 +01:00
Simon Pilgrim ea8fb3b601 [X86] combineConcatVectorOps - add support for concatenation VSELECT/BLENDV nodes
If the LHS/RHS selection operands can be cheaply concatenated back together then replace 2 x 128-bit selection nodes with 1 x 256-bit node

Addresses the regression introduced in the bug fix from rGd5af6a38082b39ae520a328e44dc29ebcb036bb2
2022-06-01 10:46:06 +01:00
Phoebe Wang a2ea5b496b [X86] Add support for `-mharden-sls=[none|all|return|indirect-jmp]`
The patch addresses the feature request from https://github.com/ClangBuiltLinux/linux/issues/1633. The implementation borrows a lot from aarch64.

Reviewed By: nickdesaulniers, MaskRay

Differential Revision: https://reviews.llvm.org/D126137
2022-06-01 09:45:04 +08:00
Simon Pilgrim d5af6a3808 [X86] LowerMINMAX - split v4i64 types on AVX1 targets (Issue #55648)
Originally we tried to use default expansion for v4i64 types to make it easier to concatenate the results back together, but this can cause infinite loop issues with existing VSELECT splitting code in narrowExtractedVectorSelect if we have other uses of the VSELECT results (e.g. reduction patterns).

To fix the infinite loop, this patch always splits MIN/MAX v4i64 nodes during lowering and I've added a TODO for combineConcatVectorOps to investigate when we can cheaply concatenate VSELECT/BLENDV nodes together.

Fixes #55648 - regression test case will be added in a follow up.
2022-05-31 17:28:56 +01:00
Simon Pilgrim af0113cf77 [X86] combineEXTRACT_SUBVECTOR - pull out repeated getVectorNumElements() calls. NFC. 2022-05-31 16:13:54 +01:00
Simon Pilgrim b9443cb6fa [X86] narrowExtractedVectorSelect - don't peek through bitcasts to find source vector
We don't seem to need this for any test coverage and it was making tracking of the uses() of the source vector more difficult

Noticed while investigating Issue #55648
2022-05-31 14:57:18 +01:00
Simon Pilgrim ed0303aa22 [X86] LowerTRUNCATE - avoid creating extract_subvector(bitcast(vec)) patterns
We have a generic DAG combine to attempt to fold extract_subvector(bitcast(vec)) -> bitcast(extract_subvector(vec)) but if we create these patterns late in lowering then we often miss them.

Noticed while investigating Issue #55648 which gets caught in an infinite loop trying to split extract_subvector(bitcast(vselect())) patterns - this doesn't fix the issue yet but reduces the regressions from the WIP fix.
2022-05-31 14:30:56 +01:00
Simon Pilgrim d384a4c530 [X86] Adjust vector test costs to match SoG (Issue #54889)
znver1/2 models were incorrectly modelling the latency/throughput/uops and znver1 ymm variants also require double pumping.

Now matches what I can decipher from the AMD SoG, Agner and instlatx64 numbers vs the llvm-exegesis report provided by @fabian-r
2022-05-31 09:14:06 +01:00
Xiang1 Zhang 5d5aba78db [X86][NFC] Refine X86 Domain Reassignment for compiling time
Differential Revision: https://reviews.llvm.org/D126622
2022-05-31 10:10:40 +08:00
Simon Pilgrim 14cc4674bf [X86] Adjust vector fp test costs to match int test costs
znver1/2 models were missing the vtestps/pd overrides to match the vptest integer equivalents.

Noticed while investigating Issue #54889
2022-05-30 09:50:15 +01:00
Simon Pilgrim 1956f28037 [X86] Adjust vector extend to ymm to match SoG (Issue #54889)
znver1 ymm variants of VPMOVSX**/VPMOVZX** instructions require double pumping.

Now matches AMD SoG, Agner and instlatx64 numbers.

Thanks to @fabian-r for the report
2022-05-30 08:58:56 +01:00
Simon Pilgrim c99690462e [X86] Adjust vector shift costs to match SoG (Issue #54889)
znver1/2 models were incorrectly modelling the fpupipe (should be pipe2 for shift-by-scalar-amount and pipe1 for shift-by-element-amount) and znver1 ymm variants also require double pumping.

Now matches AMD SoG, Agner and instlatx64 numbers.

Thanks to @fabian-r for the report
2022-05-29 17:55:39 +01:00
eopXD 6a84579243 [LSR][TTI][PowerPC][SystemZ][X86] Add const-ness to TTI::isLSRCostLess. NFC
Reviewed By: Meinersbur

Differential Revision: https://reviews.llvm.org/D126350
2022-05-27 15:22:23 -07:00
Luo, Yuanke aaaf9cede7 [X86][AMX] Replace LDTILECFG with PLDTILECFGV on auto-config.
There is an intrinsic `@llvm.x86.ldtilecfg` which is lowered to LDTILECFG.
This intrinsic lets users configure tile registers by themselves. There is
a chance that `@llvm.x86.ldtilecfg` would be mixed with the new AMX
intrinsics, which depend on the compiler to configure tile registers. A
separate pseudo instruction, PLDTILECFGV, avoids unexpected behaviour when
`@llvm.x86.ldtilecfg` is mixed with the new AMX intrinsics. Though users
should not mix the two programming models, the compiler should avoid
crashes or UB when they are mixed.

Differential Revision: https://reviews.llvm.org/D126519
2022-05-27 16:38:35 +08:00
Zongwei Lan ad73ce318e [Target] use getSubtarget<> instead of static_cast<>(getSubtarget())
Differential Revision: https://reviews.llvm.org/D125391
2022-05-26 11:22:41 -07:00
Fangrui Song 9ee15bba47 [MC] Lower case the first letter of EmitCOFF* EmitWin* EmitCV*. NFC 2022-05-26 00:14:08 -07:00
Maksim Panchenko bed9efed71 [MCDisassembler] Disambiguate Size parameter in tryAddingSymbolicOperand()
MCSymbolizer::tryAddingSymbolicOperand() overloaded the Size parameter
to specify either the instruction size or the operand size depending on
the architecture. However, for proper symbolic disassembly on X86, we
need to know both sizes, as an instruction can have two operands, and
the instruction size cannot be reliably calculated based on the operand
offset and its size. Hence, split Size into OpSize and InstSize.

For X86, the new interface allows us to fix a couple of issues:
  * Correctly adjust the value of PC-relative operands.
  * Set operand size to zero when the operand is specified implicitly.

Differential Revision: https://reviews.llvm.org/D126101
2022-05-25 13:44:32 -07:00
Craig Topper 06fee478d2 [X86] Add isSimple check to the load combine in combineExtractVectorElt.
I think we need to be sure the load isn't volatile before we
duplicate and shrink it.

Reviewed By: spatel

Differential Revision: https://reviews.llvm.org/D126353
2022-05-25 09:11:11 -07:00
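
A hedged sketch of the guard described above (helper name is illustrative, not the actual combineExtractVectorElt code):

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  // A load may only be duplicated and shrunk when it is "simple",
  // i.e. neither volatile nor atomic.
  static bool canDuplicateAndShrink(const LoadSDNode *Ld) {
    return Ld->isSimple();
  }
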
Simon Pilgrim 6c80267d0f [CostModel][X86] getScalarizationOverhead - improve extraction costs for > 128-bit vectors
We were using the default getScalarizationOverhead expansion for extraction costs, which adds up all the individual element extraction costs.

This is fine for 128-bit vectors, but for 256/512-bit vectors each element extraction also has to account for extracting the upper 128-bit subvector extraction before it can handle the element. For scalarization costs we only need to extract each demanded subvector once.

Differential Revision: https://reviews.llvm.org/D125527
2022-05-24 15:18:08 +01:00
Luo, Yuanke 3b1de7ab60 [X86][AMX] Reduce the compiling time for non-amx code.
Differential Revision: https://reviews.llvm.org/D126280
2022-05-24 18:02:51 +08:00
Luo, Yuanke 496156ac57 [X86][AMX] Multiple configure for AMX register.
The previous solution depends on variable names to record the shape
information. However, this is not reliable, because in release builds the
compiler does not set variable names. It can be worked around with the
additional option `-fno-discard-value-names`, but that is not acceptable
for users.
This patch preconfigures the tile registers with machine instructions,
following the same approach as single configure. In the future we can
fall back to multiple configure when single configure fails due to the
shape dependency issue.
The algorithm to configure the tile registers is simple in this patch; we
may improve it in the future. It configures tile registers per basic
block. The compiler spills a tile register if it lives out of the basic
block. After configuration there should be no spill across a tile
configure in register allocation. Just like fast register allocation, the
algorithm walks the instructions in reverse order. When the shape
dependency isn't met, it inserts ldtilecfg after the last instruction
that defines the shape.
In post-configuration the compiler also walks the basic blocks to collect
the physical tile register numbers and generates instructions to fill the
stack slots with the corresponding shape information.
TODO: There is some follow-up work in D125602. The risk is that modifying
the fast RA may cause regressions, as fast RA is used for different
targets. We may create an independent RA for tile registers.

Differential Revision: https://reviews.llvm.org/D125075
2022-05-24 13:18:42 +08:00
Luo, Yuanke d5999bd3f7 [X86][AMX][NFC] Refactor X86LowerAMXCast.cpp
Change static function to X86LowerAMXCast member function.

Differential Revision: https://reviews.llvm.org/D126058
2022-05-20 19:32:09 +08:00
Bill Wendling 6e00a34cdb [AArch64] Add support for -fzero-call-used-regs
Support the "-fzero-call-used-regs" option on AArch64. This involves much less
specialized code than the X86 version. Most of the checks can be done with
TableGen.

Reviewed By: nickdesaulniers, MaskRay

Differential Revision: https://reviews.llvm.org/D124836
2022-05-19 16:58:28 -07:00
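
For context, a single function can also opt in via the corresponding attribute; a hedged example (attribute spelling per the Clang docs as I recall it, so treat it as an assumption):

  // Zero the call-used GPRs this function actually touched before returning.
  __attribute__((zero_call_used_regs("used-gpr")))
  int scrub_after_use(int Secret) {
    return Secret * 31;
  }
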
Sotiris Apostolakis a094ad03f3 [NFC] Fix typos in X86CmovConversion 2022-05-19 15:13:11 +00:00
Jay Foad 6bec3e9303 [APInt] Remove all uses of zextOrSelf, sextOrSelf and truncOrSelf
Most clients only used these methods because they wanted to be able to
extend or truncate to the same bit width (which is a no-op). Now that
the standard zext, sext and trunc allow this, there is no reason to use
the OrSelf versions.

The OrSelf versions additionally have the strange behaviour of allowing
extending to a *smaller* width, or truncating to a *larger* width, which
are also treated as no-ops. A small amount of client code relied on this
(ConstantRange::castOp and MicrosoftCXXNameMangler::mangleNumber) and
needed rewriting.

Differential Revision: https://reviews.llvm.org/D125557
2022-05-19 11:23:13 +01:00
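
A small sketch of the API change described above: extending or truncating an APInt to its current width is now a no-op handled by the plain methods (example is mine):

  #include "llvm/ADT/APInt.h"
  using namespace llvm;

  APInt widenTo64(const APInt &V) {
    // Previously one might have reached for V.zextOrSelf(64) to tolerate
    // values that are already 64 bits wide; plain zext now allows that.
    return V.zext(64);
  }
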
Simon Pilgrim 320545b577 [X86] Rename combineCONCAT_VECTORS\INSERT_SUBVECTOR\EXTRACT_SUBVECTOR to match Opcode name. NFCI.
Its a lot easier to quickly search for the combine when it actually contains the name of the opcode it combines.
2022-05-17 18:37:53 +01:00
Simon Pilgrim c64f5d44ad [X86] Attempt to fold EFLAGS into X86ISD::ADD/SUB ops
We already use combineAddOrSubToADCOrSBB to fold extended EFLAGS results into ISD::ADD/SUB ops as X86ISD::ADC/SBB carry ops.

This patch extends this to also try to fold EFLAGS results with X86ISD::ADD/SUB ops

Differential Revision: https://reviews.llvm.org/D125642
2022-05-17 10:59:24 +01:00
Sanjay Patel be7f09f7b2 [IR] create and use helper functions that test the signbit; NFCI 2022-05-16 11:26:23 -04:00
Simon Pilgrim b3077f563d [X86] Move combineAddOrSubToADCOrSBB earlier. NFC.
Make it easier to reuse in X86 ADD/SUB combines in an upcoming patch.
2022-05-15 22:06:33 +01:00
Simon Pilgrim 896557e129 [X86] Adjust fadd costs to match SoG
znver1/2 models were incorrectly modelling these on fpupipe 0 instead of 2/3 and znver1 ymm variants also require double pumping.

Now matches AMD SoG, Agner and instlatx64 numbers.

Thanks to @fabian-r for the report
2022-05-15 21:28:29 +01:00
Simon Pilgrim fd1f0c51ef [X86] lowerShuffleAsLanePermuteAndSHUFP always succeeds, so just return the result. NFC. 2022-05-15 15:53:36 +01:00
Simon Pilgrim c0f59be358 [X86] Pull out repeated isShuffleMaskInputInPlace calls. NFC. 2022-05-15 15:35:09 +01:00
Simon Pilgrim 32162cf291 [X86] lowerV4I64Shuffle - try harder to lower to PERMQ(BLENDD(V1,V2)) pattern 2022-05-15 14:57:58 +01:00
Simon Pilgrim bc90bbb759 [X86] LowerAVG - fix cut+paste typo. NFC. 2022-05-14 17:42:09 +01:00
Simon Pilgrim 98f82d69bd [X86] LowerStore - use is64BitVector() wrapper. NFCI. 2022-05-13 15:30:18 +01:00
Mingming Liu cb22cb2691 [X86] Fix 80 column violation in X86InstrInfo.cpp. NFC
Differential Revision: https://reviews.llvm.org/D125345
2022-05-10 19:56:14 -07:00
Mingming Liu 852f3d9987 Revert "[NFC] Run clang-format on llvm/lib/Target/X86/X86InstroInfo.cpp"
This reverts commit 8bef5476de.

Need to revert, update commit message and reapply.
2022-05-10 19:53:31 -07:00
Mingming Liu 8bef5476de [NFC] Run clang-format on llvm/lib/Target/X86/X86InstroInfo.cpp
Differential Revision: https://reviews.llvm.org/D125345
2022-05-10 17:56:51 -07:00
Mingming Liu fc58d7a326 [Peephole-opt][X86] Enhance peephole opt to see through SUBREG_TO_REG
(following AND) and eliminates redundant TEST instruction.

Differential Revision: https://reviews.llvm.org/D124118
2022-05-10 15:56:20 -07:00
Mingming Liu 1555c41abb Revert "Enhance peephole optimization."
This reverts commit d84ca05ef7.

Will revert, update commit message and re-commit.
2022-05-10 13:59:05 -07:00
Mingming Liu d84ca05ef7 Enhance peephole optimization.
Differential Revision: https://reviews.llvm.org/D124118
2022-05-10 12:35:35 -07:00
Matthias Braun cd19af74c0 Avoid 8 and 16bit switch conditions on x86
This adds a `TargetLoweringBase::getSwitchConditionType` callback to
give targets a chance to control the type used in
`CodeGenPrepare::optimizeSwitchInst`.

Implement callback for X86 to avoid i8 and i16 types where possible as
they often incur extra zero-extensions.

This is NFC for non-X86 targets.

Differential Revision: https://reviews.llvm.org/D124894
2022-05-10 10:00:10 -07:00
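
For illustration, the kind of source pattern affected; on X86, CodeGenPrepare can now widen the i8 switch condition to i32 to avoid the extra zero-extensions (example is mine, not from the patch):

  int classify(unsigned char C) {
    switch (C) {          // i8 condition; may now be compared as i32 on X86
    case 'a': return 1;
    case 'b': return 2;
    default:  return 0;
    }
  }
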
Simon Pilgrim 6824cf1ab7 [X86] Set some more plausible latencies for horizontal add/subs on znver1
These are all microcoded/multi-pipe nightmares on Ryzen, but we shouldn't just be using the WriteMicrocoded class which is for REALLY bad microcoded nightmares - instead use the same approximate latencies as znver2 (Agner and uops.info both suggest similar values) - and make sure we use the FPU defs for both

Fixes #53242
2022-05-08 15:48:42 +01:00
Simon Pilgrim eeb44579f1 [X86] Add description comments to SandyBridge for COPY/WriteZero/WriteVecMaskedGatherWriteback cases. NFC.
Match other models.

Use X86WriteRes for WriteVecMaskedGatherWriteback like other models as well.
2022-05-07 10:42:19 +01:00
Simon Pilgrim 3d107ce2b2 [CostModel][X86] Relax fcmp costs on SSE41 targets or later
Only pre-SSE41 targets double-pump the fp comparison ops
2022-05-06 13:29:40 +01:00
Simon Pilgrim cbfa857346 [CostModel][X86] Adjust 128-bit select costs to account for slow BLENDV op
Based off the script from D103695 - Jaguar, Bulldozer, Silvermont (et al) and Haswell all have slow BLENDV ops, so adjust the worst-case cost values
2022-05-06 13:07:34 +01:00
Simon Pilgrim d21bf51494 [CostModel][X86] Adjust pre-SSE41 fp scalar select costs to account for vector ops
Based off the script from D103695, we now mainly use BLENDV or OR(AND,ANDN) to select scalar float/double ops
2022-05-06 11:41:55 +01:00
Simon Pilgrim f0e8c1d6d9 [CostModel][X86] Adjust 256-bit select costs to account for slow BLENDV op
Based off the script from D103695, on AVX1, Jaguar/Bulldozer both have low throughput for ymm select patterns (BLENDV + OR(AND,ANDN)), and even on AVX2 Haswell still struggles with BLENDV ops
2022-05-06 11:27:37 +01:00
Nick Desaulniers 18fd09ab64 [X86SchedSandyBridge] update cost of COPY to 1 cycle from 0
To match the cost of other scheduling models. This is expected to
schedule mov instructions around INLINEASM less frequently for the
default machine scheduler (pre-RA scheduling).

Suggested by Craig Topper.

Link: https://github.com/llvm/llvm-project/issues/41914

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D122350
2022-05-05 11:14:22 -07:00
Luo, Yuanke 373ce14760 [X86][AMX] Replace PXOR instruction with SET0 in AMX pre config.
To generate a zero value, the PXOR instruction needs 3 operands that are
tied to the same vreg. This is not good in SSA form, and with undef values
the two-address instruction pass may convert
`%0:vr128 = PXORrr undef %0, undef %0`
to `%1:vr128 = PXORrr undef %1:vr128(tied-def 0), undef %0:vr128`,
which is not expected.
It can be simplified to the SET0 instruction, which only takes 1
destination operand. This should be more friendly to the two-address
instruction pass and the register allocation pass.
`%0:vr128 = V_SET0`
Also add an AVX1 code path so that it is consistent with other code.

Differential Revision: https://reviews.llvm.org/D124903
2022-05-05 10:44:57 +08:00
Craig Topper 589517925b [X86] Call initializeX86PreTileConfigPass from LLVMInitializeX86Target.
Without this, the pass doesn't show up in print-before/after-all.

Differential Revision: https://reviews.llvm.org/D124973
2022-05-04 19:09:06 -07:00
Luo, Yuanke fe7d0067bd [X86][AMX] Add mayLoad/mayStore property for AMX instructions. 2022-05-03 14:48:22 +08:00
Simon Pilgrim 59dc8ce95a [X86] Reduce some superfluous diffs between znver1/znver2 models. NFC
znver2 is mainly a search+replace of the znver1 model, but for no reason the HADD and DPPS have been moved around - try to keep these in sync (no actual changes in the models).
2022-05-02 16:45:43 +01:00
Simon Pilgrim ce9c0faca1 [X86][AMX] combineLdSt - don't dereference dyn_cast. NFC
This leads to null pointer dereference warnings - use cast<> which will assert that the cast is correct.
2022-05-02 16:45:43 +01:00
Simon Pilgrim c7662dc3e5 [X86] MOVDDUP has the same sched behaviour as MOVSHDUP/MOVSLDUP on Skylake
Fixes an old TODO - confirmed on Agner + uops.info
2022-05-02 12:50:37 +01:00
Simon Pilgrim 86bb7df6e6 [CostModel][X86] getScalarizationOverhead - handle vXi1 extracts with MOVMSK (pre-AVX512)
We can quickly extract multiple elements of a bool vector using MOVMSK ops - since we don't know what generated the vXi1, I've been optimistic and assumed we can use PMOVMSKB to extract the maximum number of bools with a single op.

The MOVMSK pattern isn't great for extract+insert round trips as vXi1 type legalization can interfere with this a lot - so this relies on us remaining good at using getScalarizationOverhead properly (and tagging both Insert and Extract modes) for those round trip cases.

The AVX512 KMOV codegen for bool extraction is a bit of a mess so for now I've not included that - the per-element cost is a lot more accurate for current codegen.
2022-05-02 09:58:39 +01:00
Simon Pilgrim 980f41d7c4 [X86] (style) Use auto for dyn_cast<> results 2022-05-01 17:15:18 +01:00
Simon Pilgrim d4f06ec874 [X86] (style) Don't use auto for non obvious types 2022-05-01 17:10:21 +01:00
Simon Pilgrim d5198cf92f [CostModel][X86] Check for 'null op' truncations
If the legalized src/dst types are the same, assume the "truncation" is free.

This fixes some edge cases such as mul lo/hi ops and bool vectors which will get legalized back to legal vector widths
2022-05-01 12:03:40 +01:00
Simon Pilgrim c2964746e3 [CostModel][X86] Reduce cost of vector selects on SSE2/AVX1 targets
Based off the script from D103695, we were exaggerating the cost of the OR(AND(X,M),AND(Y,~M)) expansion using instruction count instead of effective throughput
2022-05-01 09:32:14 +01:00
Simon Pilgrim 92235e3bf4 [X86] lowerShuffleAsRepeatedMaskAndLanePermute - permit 32-bit sublane permute for unary v32i8 cases
Increase the likelihood that we can lower to a permd(pshufb()) pattern, but only after we've attempted with 64-bit sublane permutes first

Fixes #55066
2022-04-30 11:00:28 +01:00
Simon Pilgrim b424055b52 [X86] lowerShuffleAsRepeatedMaskAndLanePermute - move the sublane split code into a lambda helper. NFC.
This is a NFC cleanup as part of the work on #55066 - the idea being that we will be able to check for multiple sub lane scales.
2022-04-29 16:03:50 +01:00
Alexey Bataev 371412e065 [COST]Fix crash for non-power-2 vector shuffle mask.
Need to normalize the mask to avoid possible crashes during attempts to
estimate the cost of very long shuffles with a non-power-of-2 number of
elements in masks.
2022-04-29 07:28:07 -07:00
Simon Pilgrim 3562f855b7 [X86] SimplifyDemandedVectorEltsForTargetNode - fold (uniform) shift(0,x) -> 0 2022-04-29 12:08:47 +01:00
Simon Pilgrim 336a1233b2 [X86] SimplifyDemandedVectorEltsForTargetNode - fold shift(0,x) -> 0 2022-04-29 11:32:54 +01:00
Simon Pilgrim 6c44e398ec [X86] combineShuffle - reuse SDLoc. NFCI. 2022-04-29 10:30:11 +01:00
Simon Pilgrim 2d7f0b1c22 [X86] Fold ANDNP(undef,x)/ANDNP(x,undef) -> 0
Matches the fold in DAGCombiner::visitANDLike.
2022-04-29 10:20:48 +01:00
Simon Pilgrim ab17ed0723 [X86] Don't fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y)) on BMI2 targets
With BMI2 we have SHRX which is a lot quicker than regular x86 shifts.

Fixes #55138
2022-04-28 21:28:16 +01:00
Alan Zhao 3333c28fc0 [llvm-ml] Improve indirect call parsing
In MASM, if a QWORD symbol is passed to a jmp or call instruction in
64-bit mode or a DWORD or WORD symbol is passed in 32-bit mode, then
MSVC's assembler recognizes that as an indirect call. Additionally, if
the operand is qualified as a ptr, then that should also be an indirect
call.

Furthermore, in 64-bit mode, such operands are implicitly rip-relative
(in fact, MSVC's assembler ml64.exe does not allow explicitly specifying
rip as a base register.)

To keep this patch managable, this patch does not include:
* error messages for wrong operand types (e.g. passing a QWORD in 32-bit
  mode)
* resolving indirect calls if the symbol is declared after its first
  use (llvm-ml currently only runs a single pass).
* implementing the extern keyword (required to resolve
  https://crbug.com/762167.)

This patch is likely missing a bunch of edge cases, so please do point
them out in the review.

Reviewed By: epastor, hans, MaskRay

Committed By: epastor (on behalf of ayzhao)

Differential Revision: https://reviews.llvm.org/D124413
2022-04-28 13:17:19 -04:00
Simon Pilgrim a9215ed9cc [InstCombine][X86] simplifyDemandedVectorEltsIntrinsic - handle avx2 per-element vector shifts 2022-04-28 18:14:54 +01:00
Alexey Bataev 75e1cf4a6a [COST]Improve cost model for shuffles in SLP.
Introduced masks where they were not added and improved target-dependent
cost models to avoid returning incorrect cost results after adding masks.

Differential Revision: https://reviews.llvm.org/D100486
2022-04-28 10:04:41 -07:00
Simon Pilgrim 9e3b7e8e65 [X86] getTargetVShiftByConstNode - use SelectionDAG::FoldConstantArithmetic to perform constant folding. NFCI.
Remove some unnecessary code duplication.
2022-04-28 17:10:20 +01:00
Alexey Bataev 9861ca0c23 Revert "[COST]Improve cost model for shuffles in SLP."
This reverts commit 29a470e380 to fix
a crash reported in https://reviews.llvm.org/D100486#3479989.
2022-04-28 08:11:56 -07:00
Simon Pilgrim de7cee24b6 [X86] getBT - attempt to peek through aext(and(trunc(x),c)) mask/modulo
Ideally we'd fold this with generic DAGCombiner, but that only works for !isTruncateFree cases - we might be able to adapt IsDesirableToPromoteOp to find truncated src ops in the future, but for now just use this peephole.

Noticed in Issue #55138
2022-04-28 16:10:26 +01:00
Simon Pilgrim ed8dffef4c [X86] getFauxShuffle - don't assume an UNDEF src element for AND/ANDNP results in an UNDEF shuffle mask index
The other src element might be zero, guaranteeing zero.

Fixes #55157
2022-04-28 12:32:58 +01:00
Luo, Yuanke 942ec5c36d [X86][AMX] combine tile cast and load/store instruction.
The `llvm.x86.cast.tile.to.vector` intrinsic is lowered to
`llvm.x86.tilestored64.internal` and `load <256 x i32>`. The
`llvm.x86.cast.vector.to.tile` is lowered to `store <256 x i32>` and
`llvm.x86.tileloadd64.internal`. When `llvm.x86.cast.tile.to.vector` is
used by `store <256 x i32>` or `load <256 x i32>` is used by
`llvm.x86.cast.vector.to.tile`, they can be combined by
`llvm.x86.tilestored64.internal` and `llvm.x86.tileloadd64.internal`.

Differential Revision: https://reviews.llvm.org/D124378
2022-04-28 14:55:21 +08:00
Shengchen Kan 6a6b0e4a63 [X86] Check the address in machine verifier
1. The scale factor must be 1, 2, 4, 8
2. The displacement must fit in 32-bit signed integer

Noticed by: https://github.com/llvm/llvm-project/issues/55091

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D124455
2022-04-28 10:05:39 +08:00
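
A hedged sketch of the two checks described above (helper names are illustrative, not the actual MachineVerifier code):

  #include <cstdint>
  #include <limits>

  bool isValidX86Scale(int64_t Scale) {
    return Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8;
  }

  bool displacementFitsIn32Bits(int64_t Disp) {
    return Disp >= std::numeric_limits<int32_t>::min() &&
           Disp <= std::numeric_limits<int32_t>::max();
  }
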
Bill Wendling 8f2ec974d1 [X86] Move target-generic code into CodeGen [NFC]
This code is the same for all platforms.

Differential Revision: https://reviews.llvm.org/D124566
2022-04-27 15:37:28 -07:00
Simon Pilgrim e378577524 [X86] Use is128BitLaneRepeatedShuffleMask wrapper. NFC.
We don't need to know the actual repeated mask.
2022-04-27 21:09:57 +01:00
Alexey Bataev 29a470e380 [COST]Improve cost model for shuffles in SLP.
Introduced masks where they were not added and improved target-dependent
cost models to avoid returning incorrect cost results after adding masks.

Differential Revision: https://reviews.llvm.org/D100486
2022-04-27 10:56:26 -07:00
Simon Pilgrim 03482bccad [X86] collectConcatOps - add ability to collect from vector 'widening' patterns
Recognise insert_subvector(undef, x, lo/hi) patterns where we double the width of a vector - creating an UNDEF subvector on the fly.
2022-04-27 15:38:58 +01:00
Vasileios Porpodas fa8a9fea47 Recommit "[SLP][TTI] Refactoring of `getShuffleCost` `Args` to work like `getArithmeticInstrCost`"
This reverts commit 6a9bbd9f20.

Code review: https://reviews.llvm.org/D124202
2022-04-26 14:02:40 -07:00
Vasileios Porpodas 6a9bbd9f20 Revert "[SLP][TTI] Refactoring of `getShuffleCost` `Args` to work like `getArithmeticInstrCost`"
This reverts commit 55ce296d6f.
2022-04-26 11:25:26 -07:00
Vasileios Porpodas 55ce296d6f [SLP][TTI] Refactoring of `getShuffleCost` `Args` to work like `getArithmeticInstrCost`
Before this patch, `Args` was used to pass a broadcast's arguments by SLP.
This patch changes this. `Args` is now used for passing the operands of
the shuffle.

Differential Revision: https://reviews.llvm.org/D124202
2022-04-26 11:11:29 -07:00
Xiang1 Zhang c430f0f532 [X86] Add use condition for combineSetCCMOVMSK
Reviewed by RKSimon, LuoYuanke

Differential Revision: https://reviews.llvm.org/D123652
2022-04-26 16:42:50 +08:00
Luo, Yuanke f3ad7ea03a [X86][AMX] Report error when shapes are not pre-defined.
Instead of reporting a fatal error, this patch emits an error message and
exits when shapes are not pre-defined. This makes compilation fail rather
than crash.

Differential Revision: https://reviews.llvm.org/D124342
2022-04-26 14:57:25 +08:00
David Green 9727c77d58 [NFC] Rename Instrinsic to Intrinsic 2022-04-25 18:13:23 +01:00
Simon Pilgrim e8305c0b8f [X86] combineX86ShuffleChain - don't fold to truncate(concat(V1,V2)) if it was already a PACK op
Fixes #55050
2022-04-25 17:13:44 +01:00
Vasileios Porpodas 889588ee97 [SLP] Refactoring isLegalBroadcastLoad() to use `ElementCount`.
Replacing `unsigned` with `ElementCount` in the argument of `isLegalBroadcastLoad()`.
This helps reduce the diff of a future SLP patch for AArch64.
2022-04-21 10:19:00 -07:00
gpei-dev 3e6b904f0a Force insert zero-idiom and break false dependency of dest register for several instructions.
The related instructions are:

VPERMD/Q/PS/PD
VRANGEPD/PS/SD/SS
VGETMANTSS/SD/SH
VGETMANTPS/PD - mem version only
VPMULLQ
VFMULCSH/PH
VFCMULCSH/PH

Differential Revision: https://reviews.llvm.org/D116072
2022-04-21 16:47:13 +08:00
Matt Arsenault 3659780d58 MachineModuleInfo: Remove UsesMorestackAddr
This is x86 specific, and adds statefulness to
MachineModuleInfo. Instead of explicitly tracking this, infer if we
need to declare the symbol based on the reference previously inserted.

This produces a small change in the output due to the move from
AsmPrinter::doFinalization to X86's emitEndOfAsmFile. This will now be
moved relative to other end of file fields, which I'm assuming doesn't
matter (e.g. the __morestack_addr declaration is now after the
.note.GNU-split-stack part)

This also produces another small change in code if the module happened
to define/declare __morestack_addr, but I assume that's invalid and
doesn't really matter.
2022-04-20 11:10:20 -04:00
Matt Arsenault d7938b1a81 MachineModuleInfo: Move HasSplitStack handling to AsmPrinter
This is used to emit one field in doFinalization for the module. We
can accumulate this when emitting all individual functions directly in
the AsmPrinter, rather than accumulating additional state in
MachineModuleInfo.

Move the special case behavior predicate into MachineFrameInfo to
share it. This now promotes it to generic behavior. I'm assuming this
is fine because no other target implements adjustForSegmentedStacks,
or has tests using the split-stack attribute.
2022-04-20 10:54:29 -04:00
Matt Arsenault 209e7ef874 X86: Do not use ValueMap for PreallocatedIds
ValueMap should only be necessary if the IR values can be
replaced. This is only used during codegen, when it's illegal to
change the underlying IR. This allows using the default copy
constructor for X86MachineFunctionInfo.

I'm not happy about targets keeping state here that's only used in one
specific pass, but we don't have a better place to put it right now.
2022-04-19 21:07:47 -04:00
Craig Topper c6fdb1de47 [X86] Move some hasOneUse checks after checking what the opcode is.
Calling hasOneUse can be expensive on nodes with multiple results,
especially when some results are Chains. By checking the opcode first,
we can avoid walking the uses if it isn't an interesting node,
and thus avoid calling hasOneUse on a node that might have many uses.

Found by profiling the IR given in D123857.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D123881
2022-04-16 14:18:58 -07:00
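
A hedged sketch of the ordering described above: do the cheap opcode test before the potentially expensive use walk (fragment is illustrative, not the actual combine):

  #include "llvm/CodeGen/SelectionDAGNodes.h"
  using namespace llvm;

  static bool isInterestingSingleUseAnd(SDValue V) {
    if (V.getOpcode() != ISD::AND) // cheap: a single integer compare
      return false;
    return V.hasOneUse();          // may walk many uses on multi-result nodes
  }
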
Craig Topper 9d86bf825c [X86] Move hasOneUse check after opcode check. NFC
Checking opcode is cheap. hasOneUse might not be if the node has
multiple results. By checking the opcode we can rule out nodes
with multiple results we aren't interested in.
2022-04-15 17:20:57 -07:00
Simon Pilgrim a305d8f44e [X86] Adjust fsetcc/fmin/fmax costs to match SoG (Issue #54889)
znver1/2 models were incorrectly modelling these as 3 cycle latency instructions on the wrong pipe and znver1 ymm variants also require double pumping.

Now matches AMD SoG, Agner and instlatx64 numbers.

Thanks to @fabian-r for the report
2022-04-14 13:27:33 +01:00
Liu, Chen3 bf60a5af0a [X86] Convert unsigned int 0 to floating-point with FILD instruction.
Unsigned int 0 will be converted to float/double -0.0 when the rounding
mode is set to 'FE_DOWNWARD'. Use the FILD instruction instead of SSE
instructions on 32-bit targets if strictfp is enabled.

Differential Revision: https://reviews.llvm.org/D123660
2022-04-13 20:06:15 +08:00
Jonas Paulsson 46f83caebc [InlineAsm] Add support for address operands ("p").
This patch adds support for inline assembly address operands using the "p"
constraint on X86 and SystemZ.

This was in fact broken on X86 (see example at
https://reviews.llvm.org/D110267, Nov 23).

These operands should probably be treated the same as memory operands by
CodeGenPrepare, which have been commented with "TODO" there.

Review: Xiang Zhang and Ulrich Weigand

Differential Revision: https://reviews.llvm.org/D122220
2022-04-13 12:50:21 +02:00
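
A hedged example of an address operand with the "p" constraint in GNU-style inline assembly; the `%a0` operand modifier prints the operand as an address (example is mine, not from the patch):

  void prefetch_line(const void *Ptr) {
    asm volatile("prefetcht0 %a0" : : "p"(Ptr));
  }
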
Harald van Dijk 3337f50625 [X86] Fix handling of maskmovdqu in x32 differently
This reverts the functional changes of D103427 but keeps its tests, and
reimplements the functionality by reusing the existing 32-bit
MASKMOVDQU and VMASKMOVDQU instructions as suggested by skan in review.
These instructions were previously predicated on Not64BitMode. This
reimplementation restores the disassembly of a class of instructions,
which will see a test added in followup patch D122449.

In 64-bit mode these instructions are special cased in
X86MCInstLower::Lower, because we use one flag for two subtly
different things: we have an AdSize32 class which indicates both that
the instruction needs a 0x67 prefix and that the text form of the
instruction implies a 0x67 prefix. These instructions are special in
needing a 0x67 prefix but having a text form that does *not* imply a
0x67 prefix, so we encode this in MCInst as an instruction that has an
explicit address size override.

Note that originally VMASKMOVDQU64 was special cased to be excluded from
disassembly, as we cannot distinguish between VMASKMOVDQU and
VMASKMOVDQU64 and rely on the fact that these are indistinguishable, or
close enough to it, at the MCInst level that it does not matter which we
use. Because VMASKMOVDQU now receives special casing, even though it
does not make a difference in the current implementation, as a
precaution VMASKMOVDQU is excluded from disassembly rather than
VMASKMOVDQU64.

Reviewed By: RKSimon, skan

Differential Revision: https://reviews.llvm.org/D122540
2022-04-12 18:32:14 +01:00
Simon Pilgrim 0488c6638b [X86] getFauxShuffleMask - remove use DemandedElts TODO
Most of the getTargetShuffleInputs recursive calls have now gone and the remaining uses aren't likely to benefit from a DemandedElts mask
2022-04-12 15:36:30 +01:00
Simon Pilgrim 058a33d3c9 [X86] Account for high uop/resource usage in BSF/BSR instructions
znver1/2 models were incorrectly modelling these as single uop instructions, instead of the microcoded nightmares they really are.

Now matches AMD SoG, Agner and instlatx64 numbers.

Fixes #54811
2022-04-11 11:20:09 +01:00
Simon Pilgrim 1e803d305a Revert rG88ff6f70c45f2767576c64dde28cbfe7a90916ca "[X86] Extend vselect(cond, pshufb(x), pshufb(y)) -> or(pshufb(x), pshufb(y)) to include inner or(pshufb(x), pshufb(y)) chains"
Reverting while I investigate reports of internal test regressions/failures
2022-04-11 10:42:43 +01:00
Simon Pilgrim 88ff6f70c4 [X86] Extend vselect(cond, pshufb(x), pshufb(y)) -> or(pshufb(x), pshufb(y)) to include inner or(pshufb(x), pshufb(y)) chains 2022-04-10 13:04:53 +01:00
Simon Pilgrim c74d729bd6 [X86] combineExtractSubvector - fold extract_subvector(insert_subvector(V,X,C1),C1)
extract_subvector(insert_subvector(V,X,C1),C1) -> insert_subvector(extract_subvector(V,C1),X,0)

More aggressively attempt to reduce the width of an extract_subvector source - we currently only do this if we're inserting into a zero vector (i.e. canonicalizing to the AVX implicit zero upper elts pattern).

But if we're extracting from the same point as the inner insert_subvector then the fold is still relatively trivial - we can probably do even better if we can ensure the subvector isn't badly split.
2022-04-10 11:03:08 +01:00
Luo, Yuanke 690bed0cec [X86][AMX] Fix infinite loop of getShape.
When walking the user chain to get the shape of a phi node, if another
phi node appears in the chain, we should walk to the user of that phi
node instead of the original phi node.
2022-04-10 14:44:51 +08:00
Simon Pilgrim 30a01bccda [X86] Fold concat(pshufb(x,y),pshufb(z,w)) -> pshufb(concat(x,z),concat(y,w)) 2022-04-09 16:05:50 +01:00
Simon Pilgrim 97ee923248 [X86] lowerV64I8Shuffle - attempt to fold to SHUFFLE(ALIGNR(X,Y)) and OR(PSHUFB(X),PSHUFB(Y)) 2022-04-09 14:09:39 +01:00
Simon Pilgrim 3d4bb78fbe [X86][SSE] combineSelect - more aggressively create zero elements in the or(pshufb(x), pshufb(y)) fold
When we fold vselect(cond, pshufb(x), pshufb(y)) -> or(pshufb(x), pshufb(y)), ensure we convert all undef elements to zero elements - this should help us expose more known zero elements for deeper chains of these cases.

Noticed while triaging Issue #54819
2022-04-09 12:53:00 +01:00
Simon Pilgrim f5b4507486 [X86] Reduce some superfluous diffs between znver1/znver2 models. NFC
znver2 is mainly a search+replace of the znver1 model, but some lines have been moved around for no reason - try to keep these in sync (no actual changes in the models).
2022-04-09 10:59:18 +01:00
Nikita Popov 3075e5d2ef [X86][FastISel] Fix with.overflow + select eflags clobber (PR54369)
Don't try to directly use the with.overflow flag result in a cmov
if we need to materialize constants between the instruction
producing the overflow flag and the cmov. The current code is
careful to check that there are no other instructions in between,
but misses the constant materialization case (which may clobber
eflags via xor or constant expression evaluation).

Fixes https://github.com/llvm/llvm-project/issues/54369.

Differential Revision: https://reviews.llvm.org/D122825
2022-04-08 16:12:28 +02:00
Simon Pilgrim 5626bd4289 [X86] Fix SLM scheduler model for PMULLD (PR37059)
Adjust the PMULLD entry to match the Intel AoM numbers - PMULLD is a uop nightmare on SLM and we should model it as such.

We had reports of internal regressions the last time this was attempted (rG13a0f83a05ff), but no public repros, and tests I did last year when I had access to a SLM box failed to see anything. My hunch is that the more aggressive PMULLD -> PMADDWD folds we now perform might have helped. We can revisit this again if we ever receive an actual repro.

Fixes #36407
2022-04-08 10:07:06 +01:00
chenglin.bi f72b3a506b [x86] Replace getNodeIfExists with doesNodeExist when only checking whether a node exists
Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D123224
2022-04-08 00:33:05 +08:00
Simon Pilgrim cf3a09369a [X86] Enable fast variable per-lane shuffle tuning on all Ryzen targets (PR44795)
rGa3b8695bf592 enabled this for znver3, but AMD SoG, Agner and uops.info all agree that even znver1 has a fast per-lane shuffle op (VPSHUFB), while cross-lane shuffles seem to be slow (PERMPS etc.)

Fixes #44140

Differential Revision: https://reviews.llvm.org/D123306
2022-04-07 16:00:52 +01:00
Simon Pilgrim a1df2ef5cb [X86] Ensure ZN3Tuning inherits from ZN2Tuning instead of ZNTuning
At the moment ZN2Tuning is just a copy of ZNTuning, but we should try to keep a clean inheritance.
2022-04-07 14:01:15 +01:00
Wei Xiao 842d0bf931 [x86] Improve select lowering for smin(x, 0) & smax(x, 0)
smin(x, 0):
  (select (x < 0), x, 0) -> ((x >> (size_in_bits(x)-1))) & x

smax(x, 0):
  (select (x > 0), x, 0) -> (~(x >> (size_in_bits(x)-1))) & x
  Since the comparison is testing for a positive value, we have to invert the
  sign-bit mask, so only do that transform if the target has a bitwise 'and not'
  instruction (the invert is free).

The transform is performed only when CMP has a single user to avoid
increasing the total instruction count.

https://alive2.llvm.org/ce/z/euUnNm
https://alive2.llvm.org/ce/z/37339J
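For reference, the scalar identity behind these folds can be sketched in C++ (assuming two's complement and an arithmetic right shift of signed values, as on x86; this is only an illustration, not the lowering code):

```cpp
#include <cassert>
#include <cstdint>

// smin(x, 0): (x < 0 ? x : 0) == (x >> 31) & x
// smax(x, 0): (x > 0 ? x : 0) == ~(x >> 31) & x
// The arithmetic shift broadcasts the sign bit into a mask; AND with the
// mask (or its inverse) then keeps the negative (or positive) value.
int32_t smin0(int32_t x) { return (x >> 31) & x; }
int32_t smax0(int32_t x) { return ~(x >> 31) & x; }

int main() {
  for (int32_t x : {-7, -1, 0, 1, 42}) {
    assert(smin0(x) == (x < 0 ? x : 0));
    assert(smax0(x) == (x > 0 ? x : 0));
  }
}
```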

Differential Revision: https://reviews.llvm.org/D123109
2022-04-07 15:53:24 +08:00
Matt Arsenault c4ea925f50 AtomicExpand: Change return type for shouldExpandAtomicStoreInIR
Use the same enum as the other atomic instructions for consistency, in
preparation for addition of another strategy.

Introduce a new "Expand" option, since the store expansion does not
use cmpxchg. Alternatively, the existing CmpXChg strategy could be
renamed to Expand.
2022-04-06 22:34:04 -04:00
Roman Lebedev 9be6e7b0f2 [X86] `lowerBuildVectorAsBroadcast()`: with AVX512VL, allow i64->XMM broadcasts from constant pool
Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D123221
2022-04-06 18:33:40 +03:00
Shengchen Kan f4661b5a55 [X86] Fold MMX_MOVD64from64rr + store to MMX_MOVQ64mr instead of MMX_MOVD64from64mr in auto-generated table
This is a follow-up patch for D122241.
2022-04-06 21:33:57 +08:00
Shengchen Kan 4d21497006 [X86] Remove TB_NO_REVERSE for 2 memory folding entries
```
X86::MMX_MOVD64from64rr -> X86::MMX_MOVQ64mr
X86::MMX_MOVD64grr -> X86::MMX_MOVD64mr
```

These two entries were added in llvm-svn: 372770.
I think these two should be reversible.

Reviewed By: RKSimon, pengfei

Differential Revision: https://reviews.llvm.org/D122217
2022-04-06 17:21:12 +08:00
Martin Storsjö 46776f7556 Fix warnings about variables that are set but only used in debug mode
Add void casts to mark the variables as used, next to the places where
they are used in assert or `LLVM_DEBUG()` expressions.

Differential Revision: https://reviews.llvm.org/D123117
2022-04-06 10:01:46 +03:00
Shengchen Kan 81b10f8200 [X86][tablgen] Consider the mnemonic when auto-generating memory folding table
Intuitively, the memory folding pair should have the same mnemonic.

This patch removes
```
{X86::SENDUIPI,X86::VMXON}
```
in the auto-generated table.
And `NotMemoryFoldable` for `TPAUSE` and `CLWB` can be saved.
```
{X86::MOVLHPSrr,X86::MOVHPSrm}
{X86::VMOVLHPSZrr,X86::VMOVHPSZ128rm}
{X86::VMOVLHPSrr,X86::VMOVHPSrm}
```
It seems the three pairs above were mistakenly dropped.
But we can add them back manually later.

Reviewed By: Amir

Differential Revision: https://reviews.llvm.org/D122477
2022-04-06 12:53:05 +08:00
Pierre Gousseau a3d5f1cf5d [x86] Fix infinite loop inside DAG combiner with lzcnt feature.
The issue affects targets supporting fast-lzcnt such as btver2.
This removes extraneous zext/trunc node insertions to fix the infinite
loop.
This fixes Issue https://github.com/llvm/llvm-project/issues/54694

Differential Revision: https://reviews.llvm.org/D122900

Reviewed By: RKSimon, spatel, lebedev.ri
2022-04-05 17:32:10 +01:00
Wei Xiao ca33d74ca5 [X86] Improve x86-partial-reduction to support abs intrinsic
The current implementation only recognizes an absolute-value operation
implemented with a select instruction. This patch adds support for the abs intrinsic.

Differential Revision: https://reviews.llvm.org/D122777
2022-04-05 11:32:09 +08:00
Simon Pilgrim ffe0cc82db [X86] Add XOR(X, MIN_SIGNED_VALUE) -> ADD(X, MIN_SIGNED_VALUE) isel patterns (PR52267)
Improve chances of folding to LEA patterns

Differential Revision: https://reviews.llvm.org/D123043
2022-04-04 19:47:06 +01:00
Simon Pilgrim 623d4b5787 [X86] Support optional NOT stages in the AND(SRL(X,Y),1) -> SETCC(BT(X,Y)) fold
Extension to D122891, peek through NOT() ops, adjusting the condcode as we go.
2022-04-04 10:51:26 +01:00
Simon Pilgrim fbfd78f7aa [X86] lowerShuffleAsRepeatedMaskAndLanePermute - allow v16i32 sub-lane permutes for v64i8 shuffles
Without VBMI, we are better off permuting v16i32 sub-lanes, even though it's a variable shuffle, if it allows us to then shuffle v64i8 in-lane repeated masks (PSHUFB etc.)

Fixes #54658
2022-04-03 10:05:10 +01:00
Simon Pilgrim 76cd11f303 [DAG] Add llvm::isMinSignedConstant helper. NFC
Pulled out of D122754
2022-04-01 17:47:34 +01:00
Simon Pilgrim c64f37f818 [X86] matchAddressRecursively - add XOR(X, MIN_SIGNED_VALUE) handling
Allows us to fold XOR(X, MIN_SIGNED_VALUE) == ADD(X, MIN_SIGNED_VALUE) into LEA patterns

As mentioned on PR52267.

Differential Revision: https://reviews.llvm.org/D122815
2022-04-01 17:26:29 +01:00
Simon Pilgrim b8652fbcbb [X86] Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y)) (RECOMMITTED)
As noticed on PR39174, if we're extracting a single non-constant bit index, then try to use BT+SETCC instead to avoid having to move the shift amount into the ECX register and use slow x86 shift ops, etc.

Recommitted with a fix to ensure we zext/trunc the SETCC result to the original type.
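As a rough illustration of the source pattern this targets (names are illustrative, not from the patch):

```cpp
#include <cstdint>

// The DAG pattern AND(SRL(X,Y),1) is a single-bit extract with a variable
// index. With this fold the backend can select BT+SETCC instead of moving
// the shift amount into CL and using a variable shift.
bool extract_bit(uint64_t x, unsigned y) {
  return (x >> y) & 1;
}
```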

Differential Revision: https://reviews.llvm.org/D122891
2022-04-01 16:59:06 +01:00
Simon Pilgrim 5a457bd2fa Revert rGa5f637bcbb7d1e08ce637f113fc117c3f4b2b110 "[X86] Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y))"
Investigating a sanitizer-windows buildbot breakage
2022-04-01 16:48:24 +01:00
Simon Pilgrim 9afa6811ad [X86] lowerShuffleAsRepeatedMaskAndLanePermute - allow 64-bit sublane shuffling on AVX512BW v64i8 shuffles
We were only performing this on 256-bit vectors on AVX2 targets

Noticed while triaging Issue #54658
2022-04-01 16:40:10 +01:00
Simon Pilgrim a5f637bcbb [X86] Fold AND(SRL(X,Y),1) -> SETCC(BT(X,Y))
As noticed on PR39174, if we're extracting a single non-constant bit index, then try to use BT+SETCC instead to avoid having to move the shift amount into the ECX register and use slow x86 shift ops, etc.

Differential Revision: https://reviews.llvm.org/D122891
2022-04-01 16:07:56 +01:00
Simon Pilgrim 3245cfb8d3 [X86] Add getBT helper node for attempting to create a X86ISD::BT node
Avoids repeating all the extension/legalization wrappers in every use
2022-04-01 11:48:25 +01:00
Simon Pilgrim 919b657080 Revert rGff2d1bb2b749bd8a5697c25d2380b7c97a59ae06 "[X86] Add getBT helper node for attempting to create a X86ISD::BT node"
A typo means that this doesn't return a value in all cases.
2022-04-01 11:21:00 +01:00
Simon Pilgrim ff2d1bb2b7 [X86] Add getBT helper node for attempting to create a X86ISD::BT node
Avoids repeating all the extension/legalization wrappers in every use
2022-04-01 11:12:23 +01:00
Simon Pilgrim cb5c4a5917 [X86] lowerV8I16Shuffle - use explicit SmallVector<SDValue, 4> width to avoid MSVC AVX alignment bug
As discussed on Issue #54645 - building llc with /AVX can result in incorrectly aligned structs
2022-04-01 10:54:24 +01:00
Fangrui Song ac6878b330 [X86] Set frame-setup/frame-destroy on prologue/epilogue CFI instructions
This approach is used by AArch64/RISCV to make frame-setup/frame-destroy
instructions contiguous instead of being interleaved by CFI instructions. Code
checking `MBBI->getFlag(MachineInstr::FrameSetup) || MBBI->isCFIInstruction()`
can be simplified to just check FrameSetup.

This helps locate all CFI instructions in the prologue, which can be handy to use
.cfi_remember_state/.cfi_restore_state to decrease unwind table size (D114545).

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D122541
2022-03-31 23:04:50 -07:00
Matt Arsenault f635be3014 X86/GlobalISel: Use LLT form of getMachineMemOperand 2022-03-31 18:49:23 -04:00
Simon Pilgrim 535211c3eb [X86] Remove redundant FIXME
lowerV64I8Shuffle has been extended a lot since this was added.
2022-03-31 18:05:52 +01:00
Simon Pilgrim fac1729924 [X86] lowerV64I8Shuffle - don't use lowerShuffleWithPERMV until we've tried simpler options
Shuffle combining will still lower to this with better fast cross lane checks.

Noticed while triaging Issue #54658
2022-03-31 18:05:51 +01:00
Sanjay Patel 4a54e3eed3 [x86] try to replace 0.0 in fcmp with negated operand
This inverts a fold recently added to IR with:
3491f2f4b0

We can put -bidirectional on the Alive2 examples to show that
the reverse transforms work:
https://alive2.llvm.org/ce/z/8iVQwB

The motivation for the IR change was to improve matching to
'fabs' in IR (see https://github.com/llvm/llvm-project/issues/38828 ),
but it regressed x86 codegen for 'not-quite-fabs' patterns like
(X > -X) ? X : -X.
I.e., when there is no fast-math (nsz), the cmp+select is not a proper
fabs operation, but it does map nicely to the unusual NAN semantics
of MINSS/MAXSS.

I drafted this as a target-independent fold, but it doesn't appear to
help any other targets and seems to cause regressions for SystemZ at
least.

Differential Revision: https://reviews.llvm.org/D122726
2022-03-31 09:17:49 -04:00
Luo, Yuanke 6753eb0c90 [X86][AMX] Materialize undef or zero value to tilezero
The AMX combiner would store undef or zero to the stack and invoke tileload
to load the data into a tile register. To avoid the store/load, we can
materialize the undef or zero value with tilezero.

Differential Revision: https://reviews.llvm.org/D122714
2022-03-31 19:10:28 +08:00
Simon Pilgrim 481b185620 [X86] combineCarryThroughADD - recognise X86ISD::ADD(AND(X,1),-1) pattern can be folded to X86ISD::BT
As mentioned on D122482, if we've generated a masked overflow test see if we can fold it to X86ISD::BT to feed a X86ISD::ADC/SBB

Differential Revision: https://reviews.llvm.org/D122572
2022-03-31 09:52:55 +01:00
Luo, Yuanke 1141c8b6fc [X86][AMX] Fix bug for amx cast transform
After combining amx cast operations, some amx cast intrinsics may be dead
code. This patch deletes such dead code to avoid a crash.
2022-03-30 17:22:30 +08:00
Simon Pilgrim 6697e3354f [X86] combineADC - fold ADC(C1,C2,Carry) -> ADC(0,C1+C2,Carry)
If we're not relying on the flag result, we can fold the constants together into the RHS immediate operand and set the LHS operand to zero, simplifying for further folds.

We could do something similar if the flag result is in use and the constant fold doesn't affect it, but I don't have any real test cases for this yet.

As suggested by @davezarzycki on Issue #35256

Differential Revision: https://reviews.llvm.org/D122482
2022-03-30 09:11:55 +01:00
Simon Pilgrim d663166acb [CostModel][X86] Reduce cost of v2i64 icmp base cost on SSE2 targets
Based off the script from D103695, we were exaggerating the cost of the v2i64 comparison expansion using instruction count instead of effective throughput
2022-03-30 09:11:55 +01:00
Simon Pilgrim 1ec109ec58 [X86] combineCarryThroughADD - remove unused peek through of SEXT/AEXT nodes. 2022-03-29 17:22:50 +01:00
Shao-Ce SUN 662b9fa02c [NFC][CodeGen] Add a setTargetDAGCombine overload that uses ArrayRef
Reviewed By: arsenm

Differential Revision: https://reviews.llvm.org/D122557
2022-03-29 09:53:24 +08:00
Simon Pilgrim 8a1956dfa5 [X86] lowerV64I8Shuffle - attempt to match with lowerShuffleAsLanePermuteAndPermute
Fixes #54562
2022-03-28 17:21:27 +01:00
Kazu Hirata 6212871968 [Target] Apply clang-tidy fixes for readability-redundant-member-init (NFC) 2022-03-27 22:22:37 -07:00
Phoebe Wang 674d52e8ce [X86] Refactor X86ScalarSSEf16/32/64 with hasFP16/SSE1/SSE2. NFCI
This is used for f16 emulation. We emulate f16 for SSE2 targets and
above. This refactoring makes the future code cleaner.

Reviewed By: LuoYuanke

Differential Revision: https://reviews.llvm.org/D122475
2022-03-27 12:24:02 +08:00
Shengchen Kan dc68ca3eff [X86][tablgen] Rename field hasREX_WPrefix to hasREX_W for X86Inst. NFC
To make it more like hasVEX_L and hasEVEX_K, etc.
2022-03-26 23:14:08 +08:00
Shengchen Kan 271e8d2495 [X86][tablgen] Refine the class RecognizableInstr. NFCI
1. Add comments to explain why we set `isAsmParserOnly` for XACQUIRE and XRELEASE
2. Check `X86Inst` in the constructor of `RecognizableInstrBase` so that
   we can avoid the case where one of its fields is not initialized but
   accessed by a user. (e.g. in X86EVEX2VEXTablesEmitter.cpp)
3. Move `Rec` from `RecognizableInstrBase` to `RecognizableInstr` to reduce
   size of `RecognizableInstrBase`
4. Remove out-of-date comments for shouldBeEmitted() (filter() was removed)
5. Add a basic field `IsAsmParserOnly` and remove the field
   `ShouldBeEmitted` b/c we can deduce it w/ little overhead
2022-03-26 22:41:49 +08:00
Simon Pilgrim 43a969debd [X86] combineADC - pull out repeated dyn_cast<ConstantSDNode> calls. NFC. 2022-03-25 12:53:08 +00:00
Simon Pilgrim 3db858c58c [X86] combineAdd - fold ADD(ADC(Y,0,W),X) -> ADC(X,Y,W)
This also exposed a missed ADC canonicalization of constant ops to the RHS
2022-03-25 10:52:10 +00:00
Simon Pilgrim 33b214b711 [X86] combineSub - fold SUB(X,ADC(Y,0,W)) -> SBB(X,Y,W) 2022-03-24 18:00:00 +00:00
Alexander Belyaev bef928f8b2 [llvm] Initialize and move UseUpRegs outside of `union` MemOp struct.
Asan complained about uninitialized bool

`invalid-bool-load`
llvm/lib/Target/X86/AsmParser/X86Operand.h:389:12: runtime error: load
of value 171, which is not a valid value for type 'bool'

Differential Revision: https://reviews.llvm.org/D122405
2022-03-24 16:53:38 +01:00
Shengchen Kan c34365149d [X86][NFC] Remove unused variable introduced by D121785 2022-03-24 18:48:10 +08:00
Dávid Bolvanský 03e7fb9d53 [NFCI] Fix set-but-unused warning in X86LoadValueInjectionLoadHardening.cpp 2022-03-24 08:33:40 +01:00
Dávid Bolvanský 44572be295 [NFCI] Fix set-but-unused warning in X86AsmBackend.cpp 2022-03-24 08:13:28 +01:00
Kai Luo 77cc68b049 [X86][NFC] Fix missing `override` in `isMemUseUpRegs`
Fix warning
```
warning: 'isMemUseUpRegs' overrides a member function but is not marked 'override' [-Winconsistent-missing-override]
```
2022-03-24 12:35:15 +08:00
Xiang1 Zhang 9566405020 [Inline asm] Fix mangle problem when variable used in inline asm.
(Correct 'Mem symbol + IntelExpr' output in PIC model)

Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D121785
2022-03-24 09:41:23 +08:00
Xiang1 Zhang 287dad13ab [InlineAsm] Fix mangle problem when global variable used in inline asm
(Add modifier P for ARR[BaseReg+IndexReg+..])

Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D120887
2022-03-24 09:41:23 +08:00
Xiang1 Zhang 8a6b644c79 [Inline asm] Fix mangle problem when variable used in inline asm.
(Connect InlineAsm Memory Operand with its real value not just name)
Revert 2 history bugfix patch:

Revert "[X86][MS-InlineAsm] Make the constraint *m to be simple place holder"
This patch reverts https://reviews.llvm.org/D115225, which mainly
fixed problems introduced by https://reviews.llvm.org/D113096

This reverts commit d7c07f60b3.

Revert "Reland "[X86][MS-InlineAsm] Use exact conditions to recognize MS global variables""
This patch reverts https://reviews.llvm.org/D116090, which fixed a problem
introduced by https://reviews.llvm.org/D115225

This reverts commit 24c68ea1eb.

Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D120886
2022-03-24 09:41:22 +08:00
Vasileios Porpodas 39aa202aff Recommit "[SLP] Fix lookahead operand reordering for splat loads." attempt 3, fixed assertion crash.
Original review: https://reviews.llvm.org/D121354

This reverts commit e6ead19b77.
2022-03-23 18:32:17 -07:00
Arthur Eubanks e6ead19b77 Revert "Recommit "[SLP] Fix lookahead operand reordering for splat loads." attempt 2, fixed assertion crash."
This reverts commit 27bd8f9492.

Causes crashes, see comments in D121973
2022-03-23 10:57:45 -07:00
Vasileios Porpodas 27bd8f9492 Recommit "[SLP] Fix lookahead operand reordering for splat loads." attempt 2, fixed assertion crash.
Original review: https://reviews.llvm.org/D121354

This reverts commit f7d7d2a08d.
2022-03-22 16:41:55 -07:00
Craig Topper 9933015fdd [X86] Fold MMX_MOVD64from64rr + store to MMX_MOVQ64mr instead of MMX_MOVD64from64mr.
MMX_MOVD64from64rr moves an MMX register to a 64-bit GPR.

MMX_MOVD64from64mr is the memory version of moving an MMX register to a
64-bit GPR. It requires the REX.W bit to be set. There are no isel
patterns that use this instruction.

MMX_MOVQ64mr is the MMX register store instruction. It doesn't
require a REX.W prefix. This makes it one byte shorter to encode
than MMX_MOVD64from64mr in many cases.

Both store instructions output the same mnemonic string. The assembler
would choose MMX_MOVQ64mr if it were to parse the output, which is
another reason using it is the correct thing to do.

Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D122241
2022-03-22 14:21:55 -07:00
Arthur Eubanks f7d7d2a08d Revert "Recommit "[SLP] Fix lookahead operand reordering for splat loads.""
This reverts commit 79613185d3.

Causes crashes, see comments in https://reviews.llvm.org/D121973.
2022-03-22 13:33:49 -07:00
Simon Pilgrim 5a65f0b4d9 [X86][SandyBridge] Remove superfluous mmx store from vector load schedule model group
Noticed by D122216
2022-03-22 10:48:29 +00:00
Shengchen Kan 021b42367a [X86] Rename MMX_MOVD64from64rm to MMX_MOVD64from64mr b/c it stores sth, NFC
Reviewed By: pengfei, RKSimon

Differential Revision: https://reviews.llvm.org/D122216
2022-03-22 17:59:28 +08:00
Vasileios Porpodas 79613185d3 Recommit "[SLP] Fix lookahead operand reordering for splat loads."
Original review: https://reviews.llvm.org/D121354

The original commit 9136145eb0 broke the build on several targets.

Differential Revision: https://reviews.llvm.org/D121973
2022-03-21 15:57:32 -07:00
Simon Pilgrim 438ac282db [X86] combineAddOrSubToADCOrSBB - Fold ADD/SUB + (AND(SRL(X,Y),1) -> ADC/SBB+BT(X,Y) (REAPPLIED)
As suggested on PR35908, if we are adding/subtracting an extracted bit, attempt to use BT instead to fold the op and use an ADC/SBB op.

Reapply with extra type legality checks - LowerAndToBT was originally only used during lowering, now that it can occur earlier we might encounter illegal types that we can either promote to i32 or just bail.

Differential Revision: https://reviews.llvm.org/D122084
2022-03-21 21:37:42 +00:00
Maksim Panchenko f8a32f333c [X86][NFCI] Remove redundant functions
Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D121731
2022-03-21 14:18:47 -07:00
Nikita Popov 1533682839 Revert "[X86] combineAddOrSubToADCOrSBB - Fold ADD/SUB + (AND(SRL(X,Y),1) -> ADC/SBB+BT(X,Y)"
This reverts commit 81569f5b6e.

This causes a segfault when building consumer-typeset in
ReleaseLTO-g configuration:
https://llvm-compile-time-tracker.com/show_error.php?commit=81569f5b6ef531a48023f28133481262ee1509a3
2022-03-21 21:52:36 +01:00
Simon Pilgrim 5fd9451668 [X86][AVX512] lower1BitShuffle - fold broadcast(setcc(x,y)) -> setcc(broadcast(x),broadcast(y)) (PR52500)
AVX512 has excellent broadcast ops for everything but vXi1 bool vectors - so if we're broadcasting a comparison result, see if we can broadcast the comparison operands instead.
2022-03-21 17:42:49 +00:00
Simon Pilgrim b6e2832fc2 [X86] Don't fold SUB(X,SBB(0,0,W)) -> SUB(ADC(0,0,W),Y)
This will further fold to an AND(SETCC_CARRY(),1) pattern which tends to prevent further folds.
2022-03-21 15:54:48 +00:00
Simon Pilgrim 315896d3ac [X86] Fold SUB(X,SBB(Y,Z,W)) -> SUB(ADC(X,Z,W),Y)
Prefer the commutable ADC over SBB to improve load folding opportunities
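Ignoring the flag outputs, the underlying integer identity can be sketched as follows (an illustrative check with arbitrary values, not the combine itself):

```cpp
#include <cassert>
#include <cstdint>

// SBB(Y,Z,W) computes Y - Z - W and ADC(X,Z,W) computes X + Z + W, so
// X - (Y - Z - W) == (X + Z + W) - Y, i.e. SUB(X,SBB(Y,Z,W)) can be
// rewritten as SUB(ADC(X,Z,W),Y); ADC commutes, which helps load folding.
int main() {
  uint32_t X = 100, Y = 37, Z = 5;
  for (uint32_t W : {0u, 1u})
    assert(X - (Y - Z - W) == (X + Z + W) - Y);
}
```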
2022-03-21 14:20:46 +00:00
Simon Pilgrim ed51e26ab4 [X86] combineAddOrSubToADCOrSBB - commute + neg subtraction patterns
Handle SUB(AND(SRL(Y,Z),1),X) -> NEG(SBB(X,0,BT(Y,Z))) folds

I'll address the X86 lost folded-load regressions in a follow-up patch
2022-03-21 13:55:35 +00:00
Simon Pilgrim 5e9365c5eb [X86] combineAddOrSubToADCOrSBB - bail for illegal types
Ensure we don't attempt to fold to illegal types to ADC/SBB nodes.

After D122084 its possible for ADD(X,AND(SRL(Y,Z),1) patterns to be matched before type legalization.
2022-03-21 13:31:21 +00:00
Simon Pilgrim 81569f5b6e [X86] combineAddOrSubToADCOrSBB - Fold ADD/SUB + (AND(SRL(X,Y),1) -> ADC/SBB+BT(X,Y)
As suggested on PR35908, if we are adding/subtracting an extracted bit, attempt to use BT instead to fold the op and use an ADC/SBB op.

Differential Revision: https://reviews.llvm.org/D122084
2022-03-21 10:57:12 +00:00
Craig Topper 4b28980772 [X86] Simplify the interface to getCondNoFromDesc.
Instead of taking a SkipDefs parameter, rename to getCondSrcNoFromDesc
and have it return the source operand number. Make getCondFromMI
responsible for adding the number of Defs for MI instructions.

While there, remove some unneeded casts to unsigned and check for
negative numbers instead of explicitly -1. Less than 0 is easier
for a compiler to codegen.

Differential Revision: https://reviews.llvm.org/D122113
2022-03-20 22:41:39 -07:00
Shengchen Kan 01136c0530 [X86][NFC] Run clang-format on cb26730aaa, fix typo and remove redundant else 2022-03-21 12:08:10 +08:00
Shengchen Kan cb26730aaa [X86][NFC] Unify implementations of getting condition code 2022-03-21 11:31:16 +08:00
Shengchen Kan 51e6059c12 [X86] Simplify function isDataInvariant by using X86MnemonicTables
This is not an NFC change b/c we add more instructions like
IMUL16/32/64r, MOV16ao16 and MOV16rr_REV etc. to the list.
But I think it's reasonable.

Reviewed By: Amir

Differential Revision: https://reviews.llvm.org/D122063
2022-03-20 18:38:58 +08:00
Simon Pilgrim 1ae3c4e948 [X86] combineAddOrSubToADCOrSBB - split to more cleanly handle commuted variants.
Split combineAddOrSubToADCOrSBB into wrapper (which handles ADDs with commuted args) and the real combine, which no longer has to account for commutation.

I'm intending to extend combineAddOrSubToADCOrSBB to detect patterns other than just X86ISD::SETCC, so we need to detect all patterns without detecting them as part of a commutation swap.
2022-03-20 09:14:21 +00:00
Shengchen Kan e58dadf3e2 [X86][NFC] Generate fields and getters for subtarget features
Non-duplicated comments are moved from X86Subtarget.h to X86.td.
This is a follow-up patch for D120906.
2022-03-20 15:27:21 +08:00
Shengchen Kan ae0ae91903 [X86][NFC] Remove unused variable UseAA 2022-03-20 13:21:25 +08:00
Shengchen Kan c266776429 [X86][NFC] Remove unused feature UseAA 2022-03-20 13:14:13 +08:00
Shengchen Kan 076a9dc99a [X86][NFC] Rename hasCMOV() to canUseCMOV(), hasLAHFSAHF() to canUseLAHFSAHF()
To make them less like other feature functions.
This is a follow-up patch for D121978.
2022-03-20 12:00:25 +08:00
Craig Topper 57b41af838 [X86] Rename FeatureCMPXCHG8B/FeatureCMPXCHG16B to FeatureCX8/CX16 to match CPUID.
Rename hasCMPXCHG16B() to canUseCMPXCHG16B() to make it less like other
feature functions. Add a similar canUseCMPXCHG8B() that aliases
hasCX8() to keep similar naming.

Differential Revision: https://reviews.llvm.org/D121978
2022-03-19 12:34:06 -07:00
Simon Pilgrim 34110a7320 [X86] combineAddOrSubToADCOrSBB - pull out repeated Y.getOperand(1) calls. NFC. 2022-03-19 17:56:11 +00:00
Simon Pilgrim b90478d422 [X86] createShuffleMaskFromVSELECT - handle BLENDV constant masks as well as VSELECT constant masks
Handle constant masks for both vselect nodes (mask != 0) and blendv nodes (mask < 0)
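A small C++ sketch of the per-element mask semantics being distinguished here (illustrative only, not the combine code):

```cpp
#include <cstdint>

// A VSELECT constant mask element is treated as a boolean: pick the LHS
// when the element is non-zero. An X86ISD::BLENDV mask element only uses
// the sign bit: pick the LHS when the element is negative.
int8_t vselect_elt(int8_t mask, int8_t lhs, int8_t rhs) {
  return mask != 0 ? lhs : rhs;
}
int8_t blendv_elt(int8_t mask, int8_t lhs, int8_t rhs) {
  return mask < 0 ? lhs : rhs;
}
```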
2022-03-19 16:51:07 +00:00
Simon Pilgrim a6c18bfbe3 [X86] combineSelect - don't constant fold BLENDV nodes like VSELECT
If an X86ISD::BLENDV op appears before legalization (in this test case due to the icmp_slt x, 0), its constant mask was being treated as a vselect mask (mask != 0) instead of a blendv mask (mask < 0)

This just prevents constant folding entirely for non-VSELECT ops.
2022-03-19 16:31:19 +00:00
Simon Pilgrim 56ad791f46 [X86] LowerAndToBT - fold BT(NOT(X),Y) -> BT(X,Y) and flip the CondCode 2022-03-19 14:03:03 +00:00
Simon Pilgrim c7ba5a9aff [X86][SSE] Add initial support for extracting non-constant bool vector elements
We can use MOVMSK+TEST/BT to extract individual bool elements even if the index isn't constant

This relies on combineBitcastvxi1 so some AVX512 cases still aren't optimized as they avoid MOVMSK usage.
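Roughly, the sign bits are collapsed into a scalar mask with MOVMSK and the requested lane is then read with an ordinary bit test; a hedged C++ sketch using SSE intrinsics (illustrative, not the lowering code):

```cpp
#include <immintrin.h>

// Extract element `idx` (0..3) of a 4-lane bool vector represented by the
// sign bits of an SSE register: MOVMSKPS packs the sign bits into a 4-bit
// scalar mask, and a scalar bit test (TEST/BT) then selects the lane, even
// when `idx` is only known at run time.
bool extract_bool_elt(__m128 boolvec, unsigned idx) {
  int mask = _mm_movemask_ps(boolvec);
  return (mask >> idx) & 1;
}
```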
2022-03-19 13:31:05 +00:00
Simon Pilgrim 5dde9c1286 [CostModel][X86] Reduce cost of extracting bool vector elements
For constant indices, these are now just a MOVMSK+TEST/BT
2022-03-18 19:02:47 +00:00
Amir Ayupov a954ade8ed [X86][NFC] Generate mnemonic tables
Produce mnemonic tables, adding the functions to llvm::X86 namespace.

Reviewed By: MaskRay, skan

Differential Revision: https://reviews.llvm.org/D121572
2022-03-18 01:46:48 -07:00
Shengchen Kan 920c2e5763 [X86][NFC] Rename target feature hasCMov->hasCMOV
This is a follow-up patch for D121975.
2022-03-18 14:05:52 +08:00
Craig Topper 6cfe41dcc8 [X86] Rename more target feature related things consistency. NFC
-Rename Mode*Bit to Is*Bit to match X86Subtarget.
-Rename FeatureLAHFSAHF to FeatureLAHFSAHF64 to match X86Subtarget.
-Use consistent capitalization

Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D121975
2022-03-17 22:27:17 -07:00
Shengchen Kan 1a70febf82 [X86] Set Int_MemBarrier as a meta-instruction
The compiler only emits a comment for `Int_MemBarrier`, so it should
be marked as a meta-instruction, which can help improve the accuracy
of debug locations.

Reviewed By: pengfei

Differential Revision: https://reviews.llvm.org/D121879
2022-03-18 13:12:28 +08:00
Vasileios Porpodas 9136145eb0 Revert "[SLP] Fix lookahead operand reordering for splat loads." due to build failures
This reverts commit 5efa78985b.
2022-03-17 18:22:04 -07:00
Vasileios Porpodas 5efa78985b [SLP] Fix lookahead operand reordering for splat loads.
Splat loads are inexpensive in X86. For a 2-lane vector we need just one
instruction: `movddup (%reg), xmm0`. Using the standard Splat score leads
to worse code. This patch adds a new score dedicated for splat loads.

Please note that a splat is usually three IR instructions:
- It is usually a load and 2 inserts:
 %ld = load double, double* %gep
 %ins1 = insertelement <2 x double> poison, double %ld, i32 0
 %ins2 = insertelement <2 x double> %ins1, double %ld, i32 1

- But it can also be a load, an insert and a shuffle:
 %ld = load double, double* %gep
 %ins = insertelement <2 x double> poison, double %ld, i32 0
 %shf = shufflevector <2 x double> %ins, <2 x double> poison, <2 x i32> zeroinitializer

Because of this some of the lit tests contain more IR instructions.

Differential Revision: https://reviews.llvm.org/D121354
2022-03-17 18:05:54 -07:00
Sanjay Patel 67e9151096 [x86] try harder to use shift instead of test if it can save some immediate bytes
We favor 'and' and 'test' in earlier phases of optimization,
and that's usually the better option, but we can save a few
instruction bytes by converting a mask constant to a shift here.
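As a rough illustration (the constants here are made up, not taken from the patch), testing a run of high bits with a wide mask immediate can instead use a shift compared against zero:

```cpp
#include <cstdint>

// For unsigned x, (x & 0xFFFFFF00) != 0 is equivalent to (x >> 8) != 0.
// The shift form needs only a 1-byte immediate instead of a 4-byte mask
// constant, saving instruction bytes when the masked value isn't reused.
bool high_bits_set_mask(uint32_t x)  { return (x & 0xFFFFFF00u) != 0; }
bool high_bits_set_shift(uint32_t x) { return (x >> 8) != 0; }
```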

Differential Revision: https://reviews.llvm.org/D121147
2022-03-17 09:10:57 -04:00
Sanjay Patel 83413bb617 [x86] reduce indentation; NFC
We may be able to refine the conditions for these transforms ( D120648 ).
2022-03-16 13:39:02 -04:00
Amir Ayupov 2c4e38fa6f [X86] Emit REX prefix immediately before the opcode
Fix prefix emission order to emit REX immediately before the opcode (SDM vol2,
2.1, Figure 2-1). According to SDM vol2 2.2.1, "Other placements are ignored".

This fix has a side effect of outputting segment override prefix in a different
order than previously (benign).

Follow-up to https://reviews.llvm.org/D120592

Reviewed By: skan, craig.topper

Differential Revision: https://reviews.llvm.org/D120871
2022-03-16 08:30:31 -07:00
Amir Ayupov 1d3719820f [X86] Preserve redundant Address-Size override prefix
Print and emit redundant Address-Size override prefix if it's set on the
instruction.

Reviewed By: skan

Differential Revision: https://reviews.llvm.org/D120592
2022-03-16 08:30:29 -07:00
Shengchen Kan 37b378386e [NFC][CodeGen] Rename some functions in MachineInstr.h and remove duplicated comments 2022-03-16 20:25:42 +08:00
Simon Pilgrim e3deb7d88b [X86] computeKnownBitsForTargetNode - add X86ISD::AND KnownBits handling
Fixes #54171
2022-03-16 11:05:36 +00:00
serge-sans-paille 989f1c72e0 Cleanup codegen includes
This is a (fixed) recommit of https://reviews.llvm.org/D121169

after:  1061034926
before: 1063332844

Discourse thread: https://discourse.llvm.org/t/include-what-you-use-include-cleanup
Differential Revision: https://reviews.llvm.org/D121681
2022-03-16 08:43:00 +01:00
Shengchen Kan 052d37dc7c [NFC][X86] Rename some variables and functions about target features
This is preparation for D121768. The member's name should align w/
the interface for the trivial target feature.
2022-03-16 13:08:52 +08:00
Matthias Braun 84ef62126a X86ISelDAGToDAG: Transform TEST + MOV64ri to SHR + TEST
Optimize a pattern where a sequence of 8/16 or 32 bits is tested for
zero: LLVM normalizes this towards an `AND` with a mask, which is usually
good, but does not work well on X86 when the mask does not fit into a
64-bit register. This DAGToDAG peephole transforms sequences like:

```
movabsq $562941363486720, %rax # imm = 0x1FFFE00000000
testq %rax, %rdi
```

to

```
shrq $33, %rdi
testw %di, %di
```

The result has a shorter encoding and saves a register if the tested
value isn't used otherwise.

Differential Revision: https://reviews.llvm.org/D121320
2022-03-15 14:18:04 -07:00
Simon Pilgrim f591231cad [X86] combineSelect - canonicalize (vXi1 bitcast(iX Cond)) with combineToExtendBoolVectorInReg before legalization
This replaces the attempt in 20af71f8ec to use combineToExtendBoolVectorInReg to create X86ISD::BLENDV masks directly; instead we use it to canonicalize the iX bitcast to a sign-extended mask and then truncate it back to vXi1 prior to legalization breaking it apart.

Fixes #53760
2022-03-15 12:16:11 +00:00
Amir Ayupov 842fa38dbe [X86] Fix cosmetic issues in instruction mnemonics
- Remove spurious } in invlpgb mnemonic
- Add \t between mnemonic and operands for ud1 instructions

Reviewed By: skan, craig.topper

Differential Revision: https://reviews.llvm.org/D121570
2022-03-14 12:29:44 -07:00
Mircea Trofin 294eca35a0 [regalloc] Remove -consider-local-interval-cost
Discussed extensively on D98232. The functionality introduced in D35816
never worked correctly. In D98232, it was fixed, but, as it was
introducing a large compile-time regression, and the value of the
original patch was called into doubt, we disabled it by default
everywhere. A year later, it appears that caused no grief, so it seems
safe to remove the disabled code.

This should be accompanied by re-opening bug 26810.

Differential Revision: https://reviews.llvm.org/D121128
2022-03-14 10:49:16 -07:00
Simon Pilgrim ad3a7654dc [X86] combineCMP - peek through zero-extensions for X86cmp(zext(x0),0) zero tests (PR38960)
If we're comparing a value against zero, strip away any zero-extension and perform the comparison on the pre-extended value

Fixes #38308

Differential Revision: https://reviews.llvm.org/D121472
2022-03-13 11:38:40 +00:00
Amir Ayupov 999fa9f687 [X86][NFC] Move table from getRelaxedOpcodeArith into its own class
Move out the table and prepare the code to reuse it for the reverse mapping.
Follows the example of memory folding/unfolding tables in X86InstrFoldTables.cpp

Preparation step to unify `llvm::X86::getRelaxedOpcodeArith` and
`getShortArithOpcode` in BOLT X86MCPlusBuilder.cpp.

Addresses https://lists.llvm.org/pipermail/llvm-dev/2022-January/154526.html

Reviewed By: skan, MaskRay

Differential Revision: https://reviews.llvm.org/D121402
2022-03-12 09:06:17 -08:00
serge-sans-paille ed98c1b376 Cleanup includes: DebugInfo & CodeGen
Discourse thread: https://discourse.llvm.org/t/include-what-you-use-include-cleanup
Differential Revision: https://reviews.llvm.org/D121332
2022-03-12 17:26:40 +01:00
Fangrui Song 689c3a2552 [MC] Fix letter case of some MCSection member functions 2022-03-11 20:07:00 -08:00
Nico Weber a278250b0f Revert "Cleanup codegen includes"
This reverts commit 7f230feeea.
Breaks CodeGenCUDA/link-device-bitcode.cu in check-clang,
and many LLVM tests, see comments on https://reviews.llvm.org/D121169
2022-03-10 07:59:22 -05:00
serge-sans-paille 7f230feeea Cleanup codegen includes
after:  1061034926
before: 1063332844

Differential Revision: https://reviews.llvm.org/D121169
2022-03-10 10:00:30 +01:00
Simon Pilgrim e4ab2024a6 [X86] convertIntLogicToFPLogic - enable fp-logic on pre-AVX targets for supported fp predicates (PR34563)
If the SETCC fp-condcode is supported on SSE as a single CMPPS/PD op then we can use convertIntLogicToFPLogic to reduce EFLAGS and XMM->GPR traffic like we do for AVX targets.

Differential Revision: https://reviews.llvm.org/D121210
2022-03-08 18:06:27 +00:00
Simon Pilgrim 9119eefe5f [X86] Add cheapX86FSETCC_SSE helper. NFC.
Identify FP CondCode that can be performed by a non-AVX SSE CMP op

Pulled out of D121210
2022-03-08 18:06:27 +00:00
Simon Pilgrim d0aa77440c [X86] convertIntLogicToFPLogic - pull out condcodes. NFCI. 2022-03-08 13:31:17 +00:00
Sanjay Patel 9fce696110 [x86] reduce code duplication for select of X86ISD::CMP; NFC 2022-03-07 15:14:20 -05:00
Maksim Panchenko cf9b3ef941 Revert "[X86] Fix MCSymbolizer interface for X86Disassembler"
This reverts commit 0c2b43ab8c.
2022-03-07 10:40:48 -08:00
Maksim Panchenko 0c2b43ab8c [X86] Fix MCSymbolizer interface for X86Disassembler
Fix a number of issues with MCSymbolizer::tryAddingSymbolicOperand()
in X86Disassembler:

  * Pass instruction size instead of immediate size.
  * Correctly adjust the value of PC-relative operands.
  * Set operand offset to zero when the operand is specified
    implicitly.

Reviewed By: Amir, skan

Differential Revision: https://reviews.llvm.org/D121065
2022-03-07 10:27:28 -08:00
Simon Pilgrim 588d97e246 [X86] getTargetVShiftNode - peek through any zext node
If the shift amount has been zero-extended, peek through as this might help us further canonicalize the shift amount.

Fixes regression mentioned in rG147cfcbef1255ba2b4875b76708dab1a685085f5
2022-03-04 17:41:45 +00:00
Simon Pilgrim 147cfcbef1 [X86] LowerShiftByScalarVariable - find splat patterns with getSplatSourceVector instead of getSplatValue
This completes the removal of uses of SelectionDAG::getSplatValue started in D119090 - by avoiding extracting the splatted element we make it a lot easier to zero-extend the bottom 64-bits of the shift amount and fixes issues we had on 32-bit targets where i64 isn't legal.

I've removed the old version of getTargetVShiftNode that took the scalar shift amount argument and LowerRotate can finally efficiently handle vXi16 rotates-by-scalar (using the same code as general funnel-shifts).

The only regression we see is in the X86-AVX2 PR52719 test case in vector-shift-ashr-256.ll - this is now hitting the same problem as the X86-AVX1 case (failure to simplify a multi-use X86ISD::VBROADCAST_LOAD) which I intend to address in a follow up patch.
2022-03-04 16:47:35 +00:00
Simon Pilgrim 940d7cd59f [X86] SimplifyDemandedVectorElts - adjust X86ISD::ANDNP demanded elts based off constant masks
Similar to what we already do in combineAndnp, if either operand is a constant then we can improve the demanded elts/bits.
2022-03-04 13:40:56 +00:00
Maksim Panchenko 7e570308f2 [NFC] Fix typos
Reviewed By: yota9, Amir

Differential Revision: https://reviews.llvm.org/D120859
2022-03-03 13:26:39 -08:00
Paul Robinson 7b85f0f32f [PS4] isPS4 and isPS4CPU are not meaningfully different 2022-03-03 11:36:59 -05:00
Simon Pilgrim 0c9c92ffc0 [X86][XOP] Tidyup VPHADD/VPHSUB unary horizontal ops default schedule class
Based off Agner and AMD SoG tables, the XOP VPHADD/VPHSUB unary horizontal ops are as fast as basic arithmetic ops, not the slower SSSE3 binary horizontal add/sub ops. This also matches what the bdver2 model already lists.

Noticed while investigating reduction add optimizations.
2022-03-03 12:07:48 +00:00
Simon Pilgrim 75c4a92706 [X86] Enable v32i16 FSHL/FSHR support
Now that we've improved splat detection we no longer see regressions in the funnel-shift-by-splat-amount test cases
2022-03-02 17:32:38 +00:00
Simon Pilgrim ab2cbb8466 [X86] LowerShiftByScalarVariable - remove 32-bit vXi64 bitcast shift amount handling
This was handled generically (and better) by D120553
2022-03-02 13:52:14 +00:00
Mircea Trofin cb2160760e [nfc][codegen] Move RegisterBank[Info].h under CodeGen
This wraps up from D119053. The 2 headers are moved as described,
fixed file headers and include guards, updated all files where the old
paths were detected (simple grep through the repo), and `clang-format`-ed it all.

Differential Revision: https://reviews.llvm.org/D119876
2022-03-01 21:53:25 -08:00
serge-sans-paille a494ae43be Cleanup includes: TransformsUtils
Estimation on the impact on preprocessor output:
before: 1065307662
after:  1064800684

Discourse thread: https://discourse.llvm.org/t/include-what-you-use-include-cleanup
Differential Revision: https://reviews.llvm.org/D120741
2022-03-01 21:00:07 +01:00
Phoebe Wang e03d216c28 [X86] Use bit test instructions to optimize some logic atomic operations
This is to match GCC's optimizations: https://gcc.godbolt.org/z/3odh9e7WE
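A hedged C++ sketch of the kind of source pattern this helps (mirroring the GCC behavior linked above; names are illustrative, and the exact set of patterns handled is defined by the patch):

```cpp
#include <atomic>
#include <cstdint>

// When only one bit of the old value is needed, an atomic fetch_or/fetch_and/
// fetch_xor with a single-bit mask can be lowered to LOCK BTS/BTR/BTC plus
// SETC instead of a CMPXCHG loop that recovers the whole previous value.
bool test_and_set_bit(std::atomic<uint32_t> &flags, unsigned bit) {
  uint32_t mask = 1u << bit;
  return (flags.fetch_or(mask, std::memory_order_seq_cst) & mask) != 0;
}
```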

Reviewed By: craig.topper

Differential Revision: https://reviews.llvm.org/D120199
2022-03-01 09:57:08 +08:00
Simon Pilgrim 2b46417aa2 [X86][SSE] Attempt to lower vec_reduce_add patterns with PSADBW for zero-extended vXi8 sources
For i16/32/64 vectors, if the upper bits are known to be zero, then we can try to truncate to vXi8 (if it's worth it) and perform this as a PSADBW to add+zext each v4i8 subvector to an i64 sum, which we can then reduce together.

This addresses some of the PR42674 test cases where the source data was vXi8 but had been extended to match a wider unsigned integer accumulator.
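For intuition, PSADBW against an all-zero operand sums each group of 8 bytes into a 64-bit lane; a scalar C++ sketch of that building block (illustrative only, not the lowering code):

```cpp
#include <cstdint>

// psadbw(src, 0) computes, per 8-byte group, the sum of absolute differences
// against zero, i.e. it simply adds the 8 unsigned bytes into a 64-bit lane.
// A vXi8 vector_reduce_add can be built from these per-group partial sums.
uint64_t psadbw_group_vs_zero(const uint8_t bytes[8]) {
  uint64_t sum = 0;
  for (int i = 0; i < 8; ++i)
    sum += bytes[i];  // |bytes[i] - 0| == bytes[i]
  return sum;
}
```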

Differential Revision: https://reviews.llvm.org/D120193
2022-02-27 15:17:42 +00:00
Jameson Nash c4b1a63a1b mark getTargetTransformInfo and getTargetIRAnalysis as const
Seems like this can be const, since Passes shouldn't modify it.

Reviewed By: wsmoses

Differential Revision: https://reviews.llvm.org/D120518
2022-02-25 14:30:44 -05:00
Paweł Bylica eb1ff70fc5 [X86] Combine ADC(ADD(X,Y),0,Carry) -> ADC(X,Y,Carry)
Reviewed By: RKSimon

Differential Revision: https://reviews.llvm.org/D120435
2022-02-25 14:31:20 +01:00
Simon Pilgrim 748bf545dc Revert rG87753cebf5f861eee418d6bce155dfa0b00f9878 "[X86] combineX86ShufflesRecursively - don't bother widening inputs before calling combineX86ShuffleChain"
Reverting while we investigate codegen regression reports
2022-02-25 08:59:53 +00:00
Amir Ayupov e38fc14c43 [X86] Introduce x86-cmov-converter-force-all
Introduce an option to expand all CMOV groups into hammocks, matching GCC's
`-fno-if-conversion2` flag. The motivation is to leave CMOV conversion
opportunities to a binary optimizer that can make the decision based on branch
misprediction rate (available e.g. in Intel's LBR).

Reviewed By: MaskRay, skan

Differential Revision: https://reviews.llvm.org/D119777
2022-02-24 10:47:22 -08:00
Simon Pilgrim a636801a36 [X86] LowerRotate - enable v8i16 ROTL/ROTR on all pre-SSE41 targets
We're still better off expanding this once we have PMOVZX
2022-02-24 14:14:08 +00:00
Simon Pilgrim 0ea50bee83 [X86] SimplifyDemandedVectorEltsForTargetNode - add X86ISD::ANDNP handling 2022-02-24 13:51:51 +00:00
Simon Pilgrim e41a138520 [X86] LowerShiftByScalarVariable - use getSplatSourceVector for vXi8 shift expansion
Using getSplatValue causes poor codegen due to not always being able to remove the EXTRACT_VECTOR_ELT created inside getSplatValue.

The vXi16 shifts/rotates are still showing occasional regressions but vXi8 is a definite improvement.
2022-02-24 11:24:06 +00:00
Simon Pilgrim 427d9f60db [X86] combineX86ShufflesRecursively - pull out repeated getValueType/getSimpleValueType calls. 2022-02-23 18:45:28 +00:00
Simon Pilgrim 87753cebf5 [X86] combineX86ShufflesRecursively - don't bother widening inputs before calling combineX86ShuffleChain
combineX86ShuffleChain no longer has to assume that the shuffle inputs are the right size, so don't create unnecessary nodes messing up oneuse limits as detailed on Issue #45319
2022-02-23 17:29:41 +00:00
Simon Pilgrim 22d0453128 [X86] combineX86ShuffleChainWithExtract - don't bother widening inputs after peeking through ISD::EXTRACT_SUBVECTOR nodes
combineX86ShuffleChain no longer has to assume that the shuffle inputs are the right size, so don't create unnecessary nodes messing up oneuse limits as detailed on Issue #45319

Removing widening from combineX86ShufflesRecursively will be the next step, followed by removing combineX86ShuffleChainWithExtract entirely
2022-02-23 15:44:24 +00:00
Sanjay Patel ad7214f23d [x86] add load folding restriction to pushAddIntoCmovOfConsts()
With only a load-fold the diffs look neutral. If there's a load and store (rmw)
fold opportunity as shown in the test based on #53862, then we end up with an
extra instruction.

Fixes #53862

Differential Revision: https://reviews.llvm.org/D120281
2022-02-22 08:02:11 -05:00
Simon Pilgrim ec910751fe [X86] combineX86ShufflesRecursively - attempt to fold ISD::EXTRACT_SUBVECTOR into a shuffle chain
Peek through if we're extracting a non-zero'th subvector in an attempt to fold the extract into a lane-crossing shuffle

This also exposes a failure to fold extract_subvector(movddup(x),c) -> movddup(extract_subvector(x,c))
2022-02-20 18:50:33 +00:00
Simon Pilgrim 8ef3e895ad [X86] combineX86ShufflesRecursively - add TODO not to generate temporary nodes
Extension to PR45974: unless we actually combine the target shuffles we shouldn't be generating temporary nodes, as they may interfere with the one-use checks in the shuffle recursions
2022-02-20 15:59:23 +00:00
Simon Pilgrim ab069f37e8 [X86] combineArithReduction - pull out repeated getVectorNumElements() calls 2022-02-19 19:41:20 +00:00