[RISCV] Enable fixed length vectors and loop vectorization with same

This change enables the use of RISCV's variable length vector registers for fixed length vectors in the IR, and implicitly enables various IR transforms which generate fixed length vectors when legal (e.g. LoopVectorize). Specifically, this enables fixed length vectors which are known to fit within the underlying (variable) hardware vector register size.
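As a minimal sketch (not from the patch): +v guarantees VLEN >= 128, so a 128-bit fixed vector such as <4 x i32> is known to fit in a single vector register, and operations on it become legal to select.

; Illustrative IR; with llc -mtriple=riscv64 -mattr=+v this can be
; selected as a vsetivli + vadd.vv sequence instead of being scalarized.
define <4 x i32> @fixed_add(<4 x i32> %a, <4 x i32> %b) {
  %sum = add <4 x i32> %a, %b
  ret <4 x i32> %sum
}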

For context, remember that the +V extension provides a minimum VLEN of 128; the embedded variants provide lower minimums. The analogy here is essentially vectorizing for SSE on a machine which may or may not include AVX2/AVX512: we won't get full utilization by default, but we will get some benefit. And of course, with an explicit -mcpu we can vectorize to the exact target hardware.
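For example (flags are illustrative; +zvl256b is one way to state a larger guarantee without naming a specific CPU), the assumed minimum can be raised beyond the +v baseline:

; Baseline: the backend may only assume VLEN >= 128.
; RUN: llc -mtriple=riscv64 -mattr=+v %s -o -
; With an explicit Zvl*b guarantee: VLEN >= 256 may be assumed,
; making wider fixed length vectors legal as well.
; RUN: llc -mtriple=riscv64 -mattr=+v,+zvl256b %s -o -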

The LV impact is mostly related to vectorizer robustness. In cases where we haven't yet fully implemented scalable vectorization support, we can fall back to fixed length vectorization.

SLP has been disabled for now, even when fixed vectors are enabled. See a310637 and the associated review. There are a few additional code quality issues which need to be worked through before turning SLP on would be reasonable.

Differential Revision: https://reviews.llvm.org/D131508
Author: Philip Reames, 2022-08-26 14:36:18 -07:00 (committed by Philip Reames)
Parent: af64a9248f
Commit: b45a262679
4 changed files with 86 additions and 23 deletions


@@ -44,7 +44,7 @@ static cl::opt<int> RVVVectorBitsMin(
     "with zero meaning no minimum size is assumed. A value of -1 "
     "means use Zvl*b extension. This is primarily used to enable "
     "autovectorization with fixed width vectors."),
-    cl::init(0), cl::Hidden);
+    cl::init(-1), cl::Hidden);
 static cl::opt<unsigned> RVVVectorLMULMax(
     "riscv-v-fixed-length-vector-lmul-max",


@@ -15,16 +15,16 @@ define void @get_lane_mask() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv1i1_i32 = call <vscale x 1 x i1> @llvm.get.active.lane.mask.nxv1i1.i32(i32 undef, i32 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %mask_nxv32i1_i64 = call <vscale x 32 x i1> @llvm.get.active.lane.mask.nxv32i1.i64(i64 undef, i64 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %mask_nxv16i1_i16 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i16(i16 undef, i16 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 56 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 14 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 224 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 112 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v16i1_i64 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v8i1_i64 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v4i1_i64 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v2i1_i64 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v16i1_i32 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v8i1_i32 = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v4i1_i32 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v2i1_i32 = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i32(i32 undef, i32 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %mask_v32i1_i64 = call <32 x i1> @llvm.get.active.lane.mask.v32i1.i64(i64 undef, i64 undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %mask_v16i1_i16 = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i16(i16 undef, i16 undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
 %mask_nxv16i1_i64 = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 undef, i64 undef)
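These drops (e.g. 112 → 5 for <16 x i1>) reflect that the fixed width variants no longer have to be scalarized element by element once fixed length vectors are legal. A hedged sketch of how such a cost can be queried in isolation (the RUN line follows the usual cost-model test convention; the function is illustrative):

; RUN: opt -passes='print<cost-model>' -disable-output \
; RUN:   -mtriple=riscv64 -mattr=+v %s 2>&1 | FileCheck %s
declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64, i64)
define <4 x i1> @lane_mask(i64 %base, i64 %tc) {
  %m = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %base, i64 %tc)
  ret <4 x i1> %m
}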


@@ -1,14 +1,31 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=-v -o - %s | FileCheck %s
-; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=+v -o - %s | FileCheck %s
+; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=-v -o - %s | FileCheck --check-prefix=CHECK-NOV %s
+; RUN: llc -start-after codegenprepare -mtriple=riscv64 -mattr=+v -o - %s | FileCheck --check-prefix=CHECK-V %s
 ; Reproducer for https://github.com/llvm/llvm-project/issues/55168.
 ; We should always return 1 (and not -1).
 define i32 @test(i32 %call.i) {
-; CHECK-LABEL: test:
-; CHECK: # %bb.0:
-; CHECK-NEXT: li a0, 1
-; CHECK-NEXT: ret
+; CHECK-NOV-LABEL: test:
+; CHECK-NOV: # %bb.0:
+; CHECK-NOV-NEXT: li a0, 1
+; CHECK-NOV-NEXT: ret
+;
+; CHECK-V-LABEL: test:
+; CHECK-V: # %bb.0:
+; CHECK-V-NEXT: lui a1, 524288
+; CHECK-V-NEXT: vsetivli zero, 2, e32, mf2, ta, mu
+; CHECK-V-NEXT: vmv.v.x v8, a1
+; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, tu, mu
+; CHECK-V-NEXT: vmv.s.x v8, a0
+; CHECK-V-NEXT: addiw a0, a1, 2
+; CHECK-V-NEXT: vsetvli zero, zero, e32, mf2, ta, mu
+; CHECK-V-NEXT: vmslt.vx v0, v8, a0
+; CHECK-V-NEXT: vmv.v.i v8, 0
+; CHECK-V-NEXT: vmerge.vim v8, v8, 1, v0
+; CHECK-V-NEXT: vsetivli zero, 1, e32, mf2, ta, mu
+; CHECK-V-NEXT: vslidedown.vi v8, v8, 1
+; CHECK-V-NEXT: vmv.x.s a0, v8
+; CHECK-V-NEXT: ret
 %t2 = insertelement <2 x i32> <i32 poison, i32 -2147483648>, i32 %call.i, i64 0
 %t3 = icmp slt <2 x i32> %t2, <i32 -2147483646, i32 -2147483646>
 %t4 = zext <2 x i1> %t3 to <2 x i32>


@@ -101,17 +101,63 @@ for.end:
 define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
 ; CHECK-LABEL: @uniform_store_i1(
 ; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[N:%.*]], 1
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
+; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP0]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF]]
+; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i64, i64* [[START:%.*]], i64 [[N_VEC]]
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT]], <2 x i64*> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64*> poison, i64* [[START]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64*> [[BROADCAST_SPLATINSERT3]], <2 x i64*> poison, <2 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi i64* [ [[START]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 0, i64 1>
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, i64* [[POINTER_PHI]], <2 x i64> <i64 2, i64 3>
+; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x i64*> [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[TMP3]], i32 0
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[TMP3]], i32 2
+; CHECK-NEXT: [[TMP7:%.*]] = bitcast i64* [[TMP6]] to <2 x i64>*
+; CHECK-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, <2 x i64>* [[TMP7]], align 4
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP1]], i64 1
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, <2 x i64*> [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP10:%.*]] = icmp eq <2 x i64*> [[TMP8]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <2 x i64*> [[TMP9]], [[BROADCAST_SPLAT4]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i1> [[TMP10]], i32 0
+; CHECK-NEXT: store i1 [[TMP12]], i1* [[DST:%.*]], align 1
+; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP10]], i32 1
+; CHECK-NEXT: store i1 [[TMP13]], i1* [[DST]], align 1
+; CHECK-NEXT: [[TMP14:%.*]] = extractelement <2 x i1> [[TMP11]], i32 0
+; CHECK-NEXT: store i1 [[TMP14]], i1* [[DST]], align 1
+; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i1> [[TMP11]], i32 1
+; CHECK-NEXT: store i1 [[TMP15]], i1* [[DST]], align 1
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i64, i64* [[POINTER_PHI]], i64 4
+; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; CHECK: middle.block:
+; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[START]], [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[FIRST_SROA:%.*]] = phi i64* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[START:%.*]], [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[FIRST_SROA:%.*]] = phi i64* [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
-; CHECK-NEXT: [[TMP0:%.*]] = load i64, i64* [[FIRST_SROA]], align 4
+; CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[FIRST_SROA]], align 4
 ; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i64, i64* [[FIRST_SROA]], i64 1
 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i64* [[INCDEC_PTR]], [[START]]
-; CHECK-NEXT: store i1 [[CMP_NOT]], i1* [[DST:%.*]], align 1
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N:%.*]]
-; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]], !llvm.loop [[LOOP0]]
+; CHECK-NEXT: store i1 [[CMP_NOT]], i1* [[DST]], align 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IV]], [[N]]
+; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK: end:
 ; CHECK-NEXT: ret void
 ;
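This hunk shows the fall-back described in the commit message: scalable vectorization of the uniform i1 store is not supported yet, so with fixed length vectors enabled LV now emits a <2 x i64> plan (interleaved by two, four elements per iteration) instead of leaving the loop scalar. For reference, here is the scalar source loop reconstructed from the CHECK lines above (a sketch; the RUN invocation is assumed, not taken from the test file):

; RUN: opt -passes=loop-vectorize -mtriple=riscv64 -mattr=+v -S %s | FileCheck %s
define void @uniform_store_i1(i1* noalias %dst, i64* noalias %start, i64 %N) {
entry:
  br label %for.body

for.body:
  %first.sroa = phi i64* [ %incdec.ptr, %for.body ], [ %start, %entry ]
  %iv = phi i64 [ %iv.next, %for.body ], [ 0, %entry ]
  %iv.next = add i64 %iv, 1
  %0 = load i64, i64* %first.sroa, align 4
  %incdec.ptr = getelementptr inbounds i64, i64* %first.sroa, i64 1
  %cmp.not = icmp eq i64* %incdec.ptr, %start
  store i1 %cmp.not, i1* %dst, align 1
  %cmp = icmp ult i64 %iv, %N
  br i1 %cmp, label %for.body, label %end

end:
  ret void
}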