1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/TargetLibraryInfo.h"
97#include "llvm/Analysis/TargetTransformInfo.h"
98#include "llvm/Analysis/ValueTracking.h"
99#include "llvm/Analysis/VectorUtils.h"
100#include "llvm/IR/Attributes.h"
101#include "llvm/IR/BasicBlock.h"
102#include "llvm/IR/CFG.h"
103#include "llvm/IR/Constant.h"
104#include "llvm/IR/Constants.h"
105#include "llvm/IR/DataLayout.h"
106#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/DiagnosticInfo.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
116#include "llvm/IR/IntrinsicInst.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/ProfDataUtils.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/InstructionCost.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/NativeFormatting.h"
136#include "llvm/Support/raw_ostream.h"
137#include "llvm/Transforms/Utils/BasicBlockUtils.h"
138#include "llvm/Transforms/Utils/InjectTLIMappings.h"
139#include "llvm/Transforms/Utils/Local.h"
140#include "llvm/Transforms/Utils/LoopSimplify.h"
141#include "llvm/Transforms/Utils/LoopUtils.h"
142#include "llvm/Transforms/Utils/LoopVersioning.h"
143#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
144#include "llvm/Transforms/Utils/SizeOpts.h"
145#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
146#include <algorithm>
147#include <cassert>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169const char LLVMLoopVectorizeFollowupVectorized[] =
170 "llvm.loop.vectorize.followup_vectorized";
171const char LLVMLoopVectorizeFollowupEpilogue[] =
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
179
180static cl::opt<bool> EnableEpilogueVectorization(
181 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
184static cl::opt<unsigned> EpilogueVectorizationForceVF(
185 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
190static cl::opt<unsigned> EpilogueVectorizationMinVF(
191 "epilogue-vectorization-minimum-VF", cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
197static cl::opt<unsigned> TinyTripCountVectorThreshold(
198 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
203static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
204 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
208// that predication is preferred, and this lists all options. I.e., the
209// vectorizer will try to fold the tail-loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
212namespace PreferPredicateTy {
213 enum Option {
214 ScalarEpilogue = 0,
215 PredicateElseScalarEpilogue,
216 PredicateOrDontVectorize
217 };
218} // namespace PreferPredicateTy
219
220static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
221 "prefer-predicate-over-epilogue",
222 cl::init(Val: PreferPredicateTy::ScalarEpilogue),
223 cl::Hidden,
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
226 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
229 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
233 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
238static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(Val: TailFoldingStyle::None),
241 cl::values(
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 clEnumValN(
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
259static cl::opt<bool> MaximizeBandwidth(
260 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
264static cl::opt<bool> EnableInterleavedMemAccesses(
265 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
270static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
271 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
274static cl::opt<unsigned> ForceTargetNumScalarRegs(
275 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
278static cl::opt<unsigned> ForceTargetNumVectorRegs(
279 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
282static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
283 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
287static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
288 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
292cl::opt<unsigned> llvm::ForceTargetInstructionCost(
293 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
298static cl::opt<bool> ForceTargetSupportsScalableVectors(
299 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
304static cl::opt<unsigned> SmallLoopCost(
305 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
309static cl::opt<bool> LoopVectorizeWithBlockFrequency(
310 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
316static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
317 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
322static cl::opt<unsigned> NumberOfStoresToPredicate(
323 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
326static cl::opt<bool> EnableIndVarRegisterHeur(
327 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
330static cl::opt<bool> EnableCondStoresVectorization(
331 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
334static cl::opt<unsigned> MaxNestedScalarReductionIC(
335 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
341 cl::Hidden,
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
345static cl::opt<bool> ForceOrderedReductions(
346 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
350static cl::opt<bool> PreferPredicatedReductionSelect(
351 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355cl::opt<bool> llvm::EnableVPlanNativePath(
356 "enable-vplan-native-path", cl::Hidden,
357 cl::desc("Enable VPlan-native vectorization path with "
358 "support for outer loop vectorization."));
359
360cl::opt<bool>
361 llvm::VerifyEachVPlan("vplan-verify-each",
362#ifdef EXPENSIVE_CHECKS
363 cl::init(true),
364#else
365 cl::init(Val: false),
366#endif
367 cl::Hidden,
368 cl::desc("Verfiy VPlans after VPlan transforms."));
369
370// This flag enables the stress testing of the VPlan H-CFG construction in the
371// VPlan-native vectorization path. It must be used in conjuction with
372// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
373// verification of the H-CFGs built.
374static cl::opt<bool> VPlanBuildStressTest(
375 "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
376 cl::desc(
377 "Build VPlan for every supported loop nest in the function and bail "
378 "out right after the build (stress test the VPlan H-CFG construction "
379 "in the VPlan-native vectorization path)."));
380
381cl::opt<bool> llvm::EnableLoopInterleaving(
382 "interleave-loops", cl::init(Val: true), cl::Hidden,
383 cl::desc("Enable loop interleaving in Loop vectorization passes"));
384cl::opt<bool> llvm::EnableLoopVectorization(
385 "vectorize-loops", cl::init(Val: true), cl::Hidden,
386 cl::desc("Run the Loop vectorization passes"));
387
388static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
389 "force-widen-divrem-via-safe-divisor", cl::Hidden,
390 cl::desc(
391 "Override cost based safe divisor widening for div/rem instructions"));
392
393static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
394 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
395 cl::Hidden,
396 cl::desc("Try wider VFs if they enable the use of vector variants"));
397
398static cl::opt<bool> EnableEarlyExitVectorization(
399 "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
400 cl::desc(
401 "Enable vectorization of early exit loops with uncountable exits."));
402
403// Likelyhood of bypassing the vectorized loop because there are zero trips left
404// after prolog. See `emitIterationCountCheck`.
405static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
406
407/// A helper function that returns true if the given type is irregular. The
408/// type is irregular if its allocated size doesn't equal the store size of an
409/// element of the corresponding vector type.
410static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
411 // Determine if an array of N elements of type Ty is "bitcast compatible"
412 // with a <N x Ty> vector.
413 // This is only true if there is no padding between the array elements.
414 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
415}
416
417/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
418/// ElementCount to include loops whose trip count is a function of vscale.
419static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
420 const Loop *L) {
421 return ElementCount::getFixed(MinVal: SE->getSmallConstantTripCount(L));
422}
423
424/// Returns "best known" trip count, which is either a valid positive trip count
425/// or std::nullopt when an estimate cannot be made (including when the trip
426/// count would overflow), for the specified loop \p L as defined by the
427/// following procedure:
428/// 1) Returns exact trip count if it is known.
429/// 2) Returns expected trip count according to profile data if any.
430/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
431/// 4) Returns std::nullopt if all of the above failed.
432static std::optional<ElementCount>
433getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
434 bool CanUseConstantMax = true) {
435 // Check if exact trip count is known.
436 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
437 return ExpectedTC;
438
439 // Check if there is an expected trip count available from profile data.
440 if (LoopVectorizeWithBlockFrequency)
441 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
442 return ElementCount::getFixed(MinVal: *EstimatedTC);
443
444 if (!CanUseConstantMax)
445 return std::nullopt;
446
447 // Check if upper bound estimate is known.
448 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
449 return ElementCount::getFixed(MinVal: ExpectedTC);
450
451 return std::nullopt;
452}
453
454namespace {
455// Forward declare GeneratedRTChecks.
456class GeneratedRTChecks;
457
458using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
459} // namespace
460
461namespace llvm {
462
463AnalysisKey ShouldRunExtraVectorPasses::Key;
464
465/// InnerLoopVectorizer vectorizes loops which contain only one basic
466/// block to a specified vectorization factor (VF).
467/// This class performs the widening of scalars into vectors, or multiple
468/// scalars. This class also implements the following features:
469/// * It inserts an epilogue loop for handling loops that don't have iteration
470/// counts that are known to be a multiple of the vectorization factor.
471/// * It handles the code generation for reduction variables.
472/// * Scalarization (implementation using scalars) of un-vectorizable
473/// instructions.
474/// InnerLoopVectorizer does not perform any vectorization-legality
475/// checks, and relies on the caller to check for the different legality
476/// aspects. The InnerLoopVectorizer relies on the
477/// LoopVectorizationLegality class to provide information about the induction
478/// and reduction variables that were found to a given vectorization factor.
479class InnerLoopVectorizer {
480public:
481 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
482 LoopInfo *LI, DominatorTree *DT,
483 const TargetLibraryInfo *TLI,
484 const TargetTransformInfo *TTI, AssumptionCache *AC,
485 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
486 ElementCount MinProfitableTripCount,
487 unsigned UnrollFactor, LoopVectorizationCostModel *CM,
488 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
489 GeneratedRTChecks &RTChecks, VPlan &Plan)
490 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
491 AC(AC), ORE(ORE), VF(VecWidth),
492 MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
493 Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
494 RTChecks(RTChecks), Plan(Plan),
495 VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {}
496
497 virtual ~InnerLoopVectorizer() = default;
498
499 /// Create a new empty loop that will contain vectorized instructions later
500 /// on, while the old loop will be used as the scalar remainder. Control flow
501 /// is generated around the vectorized (and scalar epilogue) loops consisting
502 /// of various checks and bypasses. Return the pre-header block of the new
503 /// loop. In the case of epilogue vectorization, this function is overriden to
504 /// handle the more complex control flow around the loops.
505 virtual BasicBlock *createVectorizedLoopSkeleton();
506
507 /// Fix the vectorized code, taking care of header phi's, and more.
508 void fixVectorizedLoop(VPTransformState &State);
509
510 /// Fix the non-induction PHIs in \p Plan.
511 void fixNonInductionPHIs(VPTransformState &State);
512
513 /// Returns the original loop trip count.
514 Value *getTripCount() const { return TripCount; }
515
516 /// Used to set the trip count after ILV's construction and after the
517 /// preheader block has been executed. Note that this always holds the trip
518 /// count of the original loop for both main loop and epilogue vectorization.
519 void setTripCount(Value *TC) { TripCount = TC; }
520
521 /// Return the additional bypass block which targets the scalar loop by
522 /// skipping the epilogue loop after completing the main loop.
523 BasicBlock *getAdditionalBypassBlock() const {
524 assert(AdditionalBypassBlock &&
525 "Trying to access AdditionalBypassBlock but it has not been set");
526 return AdditionalBypassBlock;
527 }
528
529protected:
530 friend class LoopVectorizationPlanner;
531
532 /// Returns (and creates if needed) the trip count of the widened loop.
533 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
534
535 // Create a check to see if the vector loop should be executed
536 Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
537
538 /// Emit a bypass check to see if the vector trip count is zero, including if
539 /// it overflows.
540 void emitIterationCountCheck(BasicBlock *Bypass);
541
542 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
543 /// vector loop preheader, middle block and scalar preheader.
544 void createVectorLoopSkeleton(StringRef Prefix);
545
546 /// Allow subclasses to override and print debug traces before/after vplan
547 /// execution, when trace information is requested.
548 virtual void printDebugTracesAtStart() {}
549 virtual void printDebugTracesAtEnd() {}
550
551 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
552 /// vector preheader and its predecessor, also connecting the new block to the
553 /// scalar preheader.
554 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
555
556 /// The original loop.
557 Loop *OrigLoop;
558
559 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
560 /// dynamic knowledge to simplify SCEV expressions and converts them to a
561 /// more usable form.
562 PredicatedScalarEvolution &PSE;
563
564 /// Loop Info.
565 LoopInfo *LI;
566
567 /// Dominator Tree.
568 DominatorTree *DT;
569
570 /// Target Library Info.
571 const TargetLibraryInfo *TLI;
572
573 /// Target Transform Info.
574 const TargetTransformInfo *TTI;
575
576 /// Assumption Cache.
577 AssumptionCache *AC;
578
579 /// Interface to emit optimization remarks.
580 OptimizationRemarkEmitter *ORE;
581
582 /// The vectorization SIMD factor to use. Each vector will have this many
583 /// vector elements.
584 ElementCount VF;
585
586 ElementCount MinProfitableTripCount;
587
588 /// The vectorization unroll factor to use. Each scalar is vectorized to this
589 /// many different vector instructions.
590 unsigned UF;
591
592 /// The builder that we use
593 IRBuilder<> Builder;
594
595 // --- Vectorization state ---
596
597 /// The vector-loop preheader.
598 BasicBlock *LoopVectorPreHeader = nullptr;
599
600 /// The scalar-loop preheader.
601 BasicBlock *LoopScalarPreHeader = nullptr;
602
603 /// Middle Block between the vector and the scalar.
604 BasicBlock *LoopMiddleBlock = nullptr;
605
606 /// Trip count of the original loop.
607 Value *TripCount = nullptr;
608
609 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
610 Value *VectorTripCount = nullptr;
611
612 /// The profitablity analysis.
613 LoopVectorizationCostModel *Cost;
614
615 /// BFI and PSI are used to check for profile guided size optimizations.
616 BlockFrequencyInfo *BFI;
617 ProfileSummaryInfo *PSI;
618
619 /// Structure to hold information about generated runtime checks, responsible
620 /// for cleaning the checks, if vectorization turns out unprofitable.
621 GeneratedRTChecks &RTChecks;
622
623 /// The additional bypass block which conditionally skips over the epilogue
624 /// loop after executing the main loop. Needed to resume inductions and
625 /// reductions during epilogue vectorization.
626 BasicBlock *AdditionalBypassBlock = nullptr;
627
628 VPlan &Plan;
629
630 /// The vector preheader block of \p Plan, used as target for check blocks
631 /// introduced during skeleton creation.
632 VPBlockBase *VectorPHVPB;
633};
634
635/// Encapsulate information regarding vectorization of a loop and its epilogue.
636/// This information is meant to be updated and used across two stages of
637/// epilogue vectorization.
638struct EpilogueLoopVectorizationInfo {
639 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
640 unsigned MainLoopUF = 0;
641 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
642 unsigned EpilogueUF = 0;
643 BasicBlock *MainLoopIterationCountCheck = nullptr;
644 BasicBlock *EpilogueIterationCountCheck = nullptr;
645 Value *TripCount = nullptr;
646 Value *VectorTripCount = nullptr;
647 VPlan &EpiloguePlan;
648
649 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
650 ElementCount EVF, unsigned EUF,
651 VPlan &EpiloguePlan)
652 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
653 EpiloguePlan(EpiloguePlan) {
654 assert(EUF == 1 &&
655 "A high UF for the epilogue loop is likely not beneficial.");
656 }
657};
658
659/// An extension of the inner loop vectorizer that creates a skeleton for a
660/// vectorized loop that has its epilogue (residual) also vectorized.
661/// The idea is to run the vplan on a given loop twice, firstly to setup the
662/// skeleton and vectorize the main loop, and secondly to complete the skeleton
663/// from the first step and vectorize the epilogue. This is achieved by
664/// deriving two concrete strategy classes from this base class and invoking
665/// them in succession from the loop vectorizer planner.
666class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
667public:
668 InnerLoopAndEpilogueVectorizer(
669 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
670 DominatorTree *DT, const TargetLibraryInfo *TLI,
671 const TargetTransformInfo *TTI, AssumptionCache *AC,
672 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
673 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
674 ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
675 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
676 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
677 BFI, PSI, Checks, Plan),
678 EPI(EPI) {}
679
680 // Override this function to handle the more complex control flow around the
681 // three loops.
682 BasicBlock *createVectorizedLoopSkeleton() final {
683 return createEpilogueVectorizedLoopSkeleton();
684 }
685
686 /// The interface for creating a vectorized skeleton using one of two
687 /// different strategies, each corresponding to one execution of the vplan
688 /// as described above.
689 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
690
691 /// Holds and updates state information required to vectorize the main loop
692 /// and its epilogue in two separate passes. This setup helps us avoid
693 /// regenerating and recomputing runtime safety checks. It also helps us to
694 /// shorten the iteration-count-check path length for the cases where the
695 /// iteration count of the loop is so small that the main vector loop is
696 /// completely skipped.
697 EpilogueLoopVectorizationInfo &EPI;
698};
699
700/// A specialized derived class of inner loop vectorizer that performs
701/// vectorization of *main* loops in the process of vectorizing loops and their
702/// epilogues.
703class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
704public:
705 EpilogueVectorizerMainLoop(
706 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
707 DominatorTree *DT, const TargetLibraryInfo *TLI,
708 const TargetTransformInfo *TTI, AssumptionCache *AC,
709 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
710 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
711 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
712 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
713 EPI, CM, BFI, PSI, Check, Plan) {}
714 /// Implements the interface for creating a vectorized skeleton using the
715 /// *main loop* strategy (ie the first pass of vplan execution).
716 BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
717
718protected:
719 /// Emits an iteration count bypass check once for the main loop (when \p
720 /// ForEpilogue is false) and once for the epilogue loop (when \p
721 /// ForEpilogue is true).
722 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
723 void printDebugTracesAtStart() override;
724 void printDebugTracesAtEnd() override;
725};
726
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, CM, BFI, PSI, Checks, Plan) {
    // Reuse the trip count computed during the first (main-loop) pass instead
    // of recomputing it for the epilogue pass.
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
      BasicBlock *Bypass,
      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
757} // end namespace llvm
758
759/// Look for a meaningful debug location on the instruction or its operands.
760static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
761 if (!I)
762 return DebugLoc::getUnknown();
763
764 DebugLoc Empty;
765 if (I->getDebugLoc() != Empty)
766 return I->getDebugLoc();
767
768 for (Use &Op : I->operands()) {
769 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
770 if (OpInst->getDebugLoc() != Empty)
771 return OpInst->getDebugLoc();
772 }
773
774 return I->getDebugLoc();
775}
776
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  // Either append the related instruction, or terminate the sentence with a
  // period when there is none.
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif
791
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location for
/// the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {
  BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
  // If debug location is attached to the instruction, use it. Otherwise if DL
  // was not provided, use the loop's.
  // Precedence: instruction's own location > caller-supplied DL > loop start.
  if (I && I->getDebugLoc())
    DL = I->getDebugLoc();
  else if (!DL)
    DL = TheLoop->getStartLoc();

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
812
813namespace llvm {
814
815/// Return a value for Step multiplied by VF.
816Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
817 int64_t Step) {
818 assert(Ty->isIntegerTy() && "Expected an integer step");
819 return B.CreateElementCount(Ty, EC: VF.multiplyCoefficientBy(RHS: Step));
820}
821
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  // Delegates to IRBuilder to materialize the element count as a value of
  // type \p Ty (constant for fixed VFs, vscale-based for scalable VFs).
  return B.CreateElementCount(Ty, EC: VF);
}
826
/// Reports a vectorization failure: prints \p DebugMsg to the debug stream
/// and emits an analysis remark (identified by \p ORETag) carrying \p OREMsg
/// via \p ORE. The remark is located at \p I if given, otherwise at
/// \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  // Hints are only used to pick the remark pass name, so the second argument
  // is irrelevant here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}
837
/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  // Hints are only used to pick the remark pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
                           I, DL)
            << Msg);
}
852
/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  // Emit lazily via a callback so the remark is only built when enabled.
  ORE->emit(RemarkBuilder: [&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}
869
870} // end namespace llvm
871
872namespace llvm {
873
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
896
897/// LoopVectorizationCostModel - estimates the expected speedups due to
898/// vectorization.
899/// In many cases vectorization is not profitable. This can happen because of
900/// a number of reasons. In this class we mainly attempt to predict the
901/// expected speedup/slowdowns due to the supported instruction set. We use the
902/// TargetTransformInfo to query the different backends for the cost of
903/// different operations.
904class LoopVectorizationCostModel {
905 friend class LoopVectorizationPlanner;
906
907public:
  /// Construct the cost model. Captures all analyses needed for costing and
  /// eagerly computes vscale tuning info, the cost kind, and the
  /// optimize-for-size decision for the original loop.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI,
                             ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {
    // vscale tuning is only meaningful if scalable vectors are supported by
    // the target (or forced on via the command-line flag).
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    // minsize functions are costed by code size instead of throughput.
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSize = llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
                                             QueryType: PGSOQueryType::IRPass);
  }
929
930 /// \return An upper bound for the vectorization factors (both fixed and
931 /// scalable). If the factors are 0, vectorization and interleaving should be
932 /// avoided up front.
933 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
934
935 /// \return True if runtime checks are required for vectorization, and false
936 /// otherwise.
937 bool runtimeChecksRequired();
938
  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
    // Feasible iff the expected cost at UserVF is computable (valid).
    return expectedCost(VF: UserVF).isValid();
  }
945
946 /// \return True if maximizing vector bandwidth is enabled by the target or
947 /// user options, for the given register kind.
948 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
949
950 /// \return True if maximizing vector bandwidth is enabled by the target or
951 /// user options, for the given vector factor.
952 bool useMaxBandwidth(ElementCount VF);
953
954 /// \return The size (in bits) of the smallest and widest types in the code
955 /// that needs to be vectorized. We ignore values that remain scalar such as
956 /// 64 bit loop indices.
957 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
958
959 /// \return The desired interleave count.
960 /// If interleave count has been specified by metadata it will be returned.
961 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
962 /// are the selected vectorization factor and the cost of the selected VF.
963 unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
964 InstructionCost LoopCost);
965
966 /// Memory access instruction may be vectorized in more than one way.
967 /// Form of instruction after vectorization depends on cost.
968 /// This function takes cost-based decisions for Load/Store instructions
969 /// and collects them in a map. This decisions map is used for building
970 /// the lists of loop-uniform and loop-scalar instructions.
971 /// The calculated cost is saved with widening decision in order to
972 /// avoid redundant calculations.
973 void setCostBasedWideningDecision(ElementCount VF);
974
975 /// A call may be vectorized in different ways depending on whether we have
976 /// vectorized variants available and whether the target supports masking.
977 /// This function analyzes all calls in the function at the supplied VF,
978 /// makes a decision based on the costs of available options, and stores that
979 /// decision in a map for use in planning and plan execution.
980 void setVectorizedCallDecision(ElementCount VF);
981
982 /// Collect values we want to ignore in the cost model.
983 void collectValuesToIgnore();
984
985 /// Collect all element types in the loop for which widening is needed.
986 void collectElementTypesForWidening();
987
988 /// Split reductions into those that happen in the loop, and those that happen
989 /// outside. In loop reductions are collected into InLoopReductions.
990 void collectInLoopReductions();
991
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    // Loop hints allowing reordering override the ordered requirement.
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }
999
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1006
  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // InstsToScalarize is populated per-VF by collectInstsToScalarize; it must
    // have run for this VF before querying.
    auto Scalars = InstsToScalarize.find(Val: VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(Val: I);
  }
1021
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(Val: I))
      return false;

    // At VF=1 every instruction is trivially uniform.
    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(Val: VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(Ptr: I);
  }
1041
  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // At VF=1 every instruction trivially remains scalar.
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(Val: VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(Ptr: I);
  }
1055
  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncation only applies to instructions that will actually be widened:
    // scalarized or scalar-after-vectorization instructions are excluded.
    return VF.isVector() && MinBWs.contains(Key: I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1063
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,         // No decision recorded for this instruction/VF pair.
    CM_Widen,           // For consecutive accesses with stride +1.
    CM_Widen_Reverse,   // For consecutive accesses with stride -1.
    CM_Interleave,      // Member of an interleaved access group.
    CM_GatherScatter,   // Lowered as a masked gather/scatter.
    CM_Scalarize,       // Replicated as scalar operations per lane.
    CM_VectorCall,      // Call lowered to a vector library variant.
    CM_IntrinsicCall    // Call lowered to a vector intrinsic.
  };
1075
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Keyed by (instruction, VF); overwrites any previous decision.
    WideningDecisions[{I, VF}] = {W, Cost};
  }
1083
1084 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1085 /// interleaving group \p Grp and vector width \p VF.
1086 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1087 ElementCount VF, InstWidening W,
1088 InstructionCost Cost) {
1089 assert(VF.isVector() && "Expected VF >=2");
1090 /// Broadcast this decicion to all instructions inside the group.
1091 /// When interleaving, the cost will only be assigned one instruction, the
1092 /// insert position. For other cases, add the appropriate fraction of the
1093 /// total cost to each instruction. This ensures accurate costs are used,
1094 /// even if the insert position instruction is not used.
1095 InstructionCost InsertPosCost = Cost;
1096 InstructionCost OtherMemberCost = 0;
1097 if (W != CM_Interleave)
1098 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1099 ;
1100 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1101 if (auto *I = Grp->getMember(Index: Idx)) {
1102 if (Grp->getInsertPos() == I)
1103 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1104 else
1105 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1106 }
1107 }
1108 }
1109
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // Missing entries are reported as CM_Unknown rather than asserting, since
    // some instructions legitimately never get a widening decision.
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    auto Itr = WideningDecisions.find(Val: InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }
1125
  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    // Unlike getWideningDecision, querying a cost that was never computed is a
    // programming error.
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
1135
  /// Cost-model decision for a call instruction at a given VF: how to widen
  /// it, which vector variant or intrinsic to use, where the mask goes (if
  /// any), and the resulting cost.
  struct CallWideningDecision {
    InstWidening Kind;               // How the call is widened.
    Function *Variant;               // Vector library variant, if used.
    Intrinsic::ID IID;               // Vector intrinsic ID, if used.
    std::optional<unsigned> MaskPos; // Operand index of the mask, if masked.
    InstructionCost Cost;            // Cost of the chosen lowering.
  };
1143
  /// Record the widening decision for call \p CI at width \p VF, together with
  /// the chosen variant/intrinsic, optional mask position and cost.
  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
  }
1151
  /// Return the recorded widening decision for call \p CI at width \p VF.
  /// The decision must have been set beforehand (at() asserts presence).
  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at(Val: {CI, VF});
  }
1157
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(Val: I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
    Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(i_nocapture: 0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(V: Op);
  }
1183
1184 /// Collects the instructions to scalarize for each predicated instruction in
1185 /// the loop.
1186 void collectInstsToScalarize(ElementCount VF);
1187
  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once.
    // Uniforms serves as the "already analyzed for this VF" marker.
    if (VF.isScalar() || Uniforms.contains(Val: VF))
      return;
    // Order matters: widening decisions feed uniformity/scalarity analysis,
    // which in turn feeds the scalarization-profitability analysis.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1205
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
                          unsigned AddressSpace) const {
    // Masked stores only apply to consecutive accesses; gathers/scatters are
    // checked separately.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
  }
1213
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
                         unsigned AddressSpace) const {
    // Masked loads only apply to consecutive accesses; gathers/scatters are
    // checked separately.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
  }
1221
1222 /// Returns true if the target machine can represent \p V as a masked gather
1223 /// or scatter operation.
1224 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1225 bool LI = isa<LoadInst>(Val: V);
1226 bool SI = isa<StoreInst>(Val: V);
1227 if (!LI && !SI)
1228 return false;
1229 auto *Ty = getLoadStoreType(I: V);
1230 Align Align = getLoadStoreAlignment(I: V);
1231 if (VF.isVector())
1232 Ty = VectorType::get(ElementType: Ty, EC: VF);
1233 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1234 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1235 }
1236
  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    // Every reduction discovered by legality analysis must be individually
    // legal at this VF.
    return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }
1245
  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem. This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      // No override: pick the cheaper strategy.
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      // Safe-divisor forced on: never scalarize.
      return false;
    case cl::BOU_FALSE:
      // Safe-divisor forced off: always scalarize.
      return true;
    }
    llvm_unreachable("impossible case value");
  }
1261
1262 /// Returns true if \p I is an instruction which requires predication and
1263 /// for which our chosen predication strategy is scalarization (i.e. we
1264 /// don't have an alternate strategy such as masking available).
1265 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1266 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1267
1268 /// Returns true if \p I is an instruction that needs to be predicated
1269 /// at runtime. The result is independent of the predication mechanism.
1270 /// Superset of instructions that return true for isScalarWithPredication.
1271 bool isPredicatedInst(Instruction *I) const;
1272
1273 /// Return the costs for our two available strategies for lowering a
1274 /// div/rem operation which requires speculating at least one lane.
1275 /// First result is for scalarization (will be invalid for scalable
1276 /// vectors); second is for the safe-divisor strategy.
1277 std::pair<InstructionCost, InstructionCost>
1278 getDivRemSpeculationCost(Instruction *I,
1279 ElementCount VF) const;
1280
1281 /// Returns true if \p I is a memory instruction with consecutive memory
1282 /// access that can be widened.
1283 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1284
1285 /// Returns true if \p I is a memory instruction in an interleaved-group
1286 /// of memory accesses that can be vectorized with wide vector loads/stores
1287 /// and shuffles.
1288 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1289
1290 /// Check if \p Instr belongs to any interleaved access group.
1291 bool isAccessInterleaved(Instruction *Instr) const {
1292 return InterleaveInfo.isInterleaved(Instr);
1293 }
1294
1295 /// Get the interleaved access group that \p Instr belongs to.
1296 const InterleaveGroup<Instruction> *
1297 getInterleavedAccessGroup(Instruction *Instr) const {
1298 return InterleaveInfo.getInterleaveGroup(Instr);
1299 }
1300
  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    // If scalar epilogues are disallowed (optsize / hints), none is required.
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit vectorization
    // is disabled, we must run the exiting iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    // Interleave groups with gaps may read past the last iteration; they
    // require the final iteration(s) to run in scalar form.
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }
1324
  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    // Any of the CM_ScalarEpilogueNot* states disallows the epilogue.
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }
1330
  /// Returns the TailFoldingStyle that is best for the current loop.
  /// Returns None until setTailFoldingStyles() has been called.
  TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
    if (!ChosenTailFoldingStyle)
      return TailFoldingStyle::None;
    // Two styles are cached: one for when the IV update may overflow, one
    // for when it provably cannot.
    return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
                               : ChosenTailFoldingStyle->second;
  }
1338
  /// Selects and saves TailFoldingStyle for 2 options - if IV update may
  /// overflow or not.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
    assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
    // If legality rules out tail folding entirely, record None for both cases.
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = {
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
                                ForceTailFoldingStyle.getValue()};

    // Only a forced DataWithEVL style needs the extra legality checks below.
    if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
      return;
    // Override forced styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fallback to
    // DataWithoutLaneMask to try to vectorize the loop with folded tail
    // in a generic way.
    ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
                              TailFoldingStyle::DataWithoutLaneMask};
    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }
1378
  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const {
    // TODO: check if it is possible to check for None style independent of
    // IVUpdateMayOverflow flag in getTailFoldingStyle.
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
1385
  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// Returns std::nullopt when no such limit applies.
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1394
  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    // Tail folding masks every block; otherwise defer to legality analysis.
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
1401
  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }
1407
  /// Returns true if the Phi is part of an inloop reduction.
  /// InLoopReductions is populated by collectInLoopReductions().
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Ptr: Phi);
  }
1412
  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect() const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;
    // Otherwise honor the command-line flag or the target preference.
    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }
1423
1424 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1425 /// with factor VF. Return the cost of the instruction, including
1426 /// scalarization overhead if it's needed.
1427 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1428
1429 /// Estimate cost of a call instruction CI if it were vectorized with factor
1430 /// VF. Return the cost of the instruction, including scalarization overhead
1431 /// if it's needed.
1432 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1433
  /// Invalidates decisions already taken by the cost model.
  /// Clears all per-VF caches so later queries trigger a fresh analysis.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1441
1442 /// Returns the expected execution cost. The unit of the cost does
1443 /// not matter because we use the 'cost' units to compare different
1444 /// vector widths. The cost that is returned is *not* normalized by
1445 /// the factor width.
1446 InstructionCost expectedCost(ElementCount VF);
1447
1448 bool hasPredStores() const { return NumPredStores > 0; }
1449
1450 /// Returns true if epilogue vectorization is considered profitable, and
1451 /// false otherwise.
1452 /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is the interleave count chosen for the original loop; it acts as
  /// an additional scaling factor applied to VF before comparing to
  /// EpilogueVectorizationMinVF.
1455 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1456 const unsigned IC) const;
1457
1458 /// Returns the execution time cost of an instruction for a given vector
1459 /// width. Vector width of one means scalar.
1460 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1461
1462 /// Return the cost of instructions in an inloop reduction pattern, if I is
1463 /// part of that pattern.
1464 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1465 ElementCount VF,
1466 Type *VectorTy) const;
1467
1468 /// Returns true if \p Op should be considered invariant and if it is
1469 /// trivially hoistable.
1470 bool shouldConsiderInvariant(Value *Op);
1471
1472 /// Return the value of vscale used for tuning the cost model.
1473 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1474
private:
  /// Number of stores expected to execute under predication; queried via
  /// hasPredStores(). NOTE(review): updated elsewhere in the cost model —
  /// not visible in this section.
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;
1481
1482 /// Initializes the value of vscale used for tuning the cost model. If
1483 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1484 /// return the value returned by the corresponding TTI method.
1485 void initializeVScaleForTuning() {
1486 const Function *Fn = TheLoop->getHeader()->getParent();
1487 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1488 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1489 auto Min = Attr.getVScaleRangeMin();
1490 auto Max = Attr.getVScaleRangeMax();
1491 if (Max && Min == Max) {
1492 VScaleForTuning = Max;
1493 return;
1494 }
1495 }
1496
1497 VScaleForTuning = TTI.getVScaleForTuning();
1498 }
1499
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;
1561
  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style. The first element is used if
  /// the IV update may overflow, the second element - if it does not.
  std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
      ChosenTailFoldingStyle;

  /// true if scalable vectorization is supported and enabled.
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to the
  /// memory dependencies. Required for EVL-based vectorization, where this
  /// value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;
1661
1662 /// Returns true if \p V is expected to be vectorized and it needs to be
1663 /// extracted.
1664 bool needsExtract(Value *V, ElementCount VF) const {
1665 Instruction *I = dyn_cast<Instruction>(Val: V);
1666 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1667 TheLoop->isLoopInvariant(V: I) ||
1668 getWideningDecision(I, VF) == CM_Scalarize)
1669 return false;
1670
1671 // Assume we can vectorize V (and hence we need extraction) if the
1672 // scalars are not computed yet. This can happen, because it is called
1673 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1674 // the scalars are collected. That should be a safe assumption in most
1675 // cases, because we check if the operands have vectorizable types
1676 // beforehand in LoopVectorizationLegality.
1677 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1678 };
1679
1680 /// Returns a range containing only operands needing to be extracted.
1681 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1682 ElementCount VF) const {
1683 return SmallVector<Value *, 4>(make_filter_range(
1684 Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1685 }
1686
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function containing \p TheLoop.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on function attribute
  /// or profile information.
  bool OptForSize;
1739};
1740} // end namespace llvm
1741
1742namespace {
1743/// Helper struct to manage generating runtime checks for vectorization.
1744///
1745/// The runtime checks are created up-front in temporary blocks to allow better
1746/// estimating the cost and un-linked from the existing IR. After deciding to
1747/// vectorize, the checks are moved back. If deciding not to vectorize, the
1748/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  /// Analyses that create() keeps up to date while the temporary check blocks
  /// exist, and from which those blocks are removed again before returning.
  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  /// Separate expanders for the SCEV predicate and the memory checks, so that
  /// each set of expanded instructions can be cleaned up independently in the
  /// destructor.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  /// Set by create() when the number of runtime pointer checks exceeds
  /// VectorizeMemoryCheckThreshold; makes getCost() return an invalid cost.
  bool CostTooHigh = false;

  /// The loop enclosing the vectorized loop, if any; set by create() and used
  /// by getCost() to discount hoistable memory checks.
  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

public:
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    const DataLayout &DL, TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
        MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
        CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh)
      return;

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
                                  MSSAU: nullptr, BBName: "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
      if (isa<Constant>(Val: SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Chain the memory-check block after the SCEV-check block, if one was
      // created above; otherwise split it off the preheader directly.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
                                 BBName: "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
            GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              // Materialize the runtime VF only once and reuse it for all
              // checks.
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
            Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(V: Preheader);

    // Move each check block's terminator into the preheader (replacing the
    // preheader's branch) and cap the now-unlinked check block with a
    // placeholder unreachable.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    // With the check blocks unhooked, the preheader again dominates the loop
    // header directly; drop the detached blocks from DT and LI.
    DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(BB: MemCheckBlock);
      LI->removeBlock(BB: MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(BB: SCEVCheckBlock);
      LI->removeBlock(BB: SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    // Sum the cost of every check instruction; terminators are skipped since
    // they are replaced when the blocks are hooked back into the IR.
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
        if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
                                     b: (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block counts as used if some predecessor still branches to it,
    // i.e. it was hooked back into the IR by code generation.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(I: &I))
          continue;
        SE.forgetValue(V: &I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getSCEVChecks() {
    using namespace llvm::PatternMatch;
    // A constant-false condition means the checks always pass; report none.
    if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added
  bool hasChecks() const {
    using namespace llvm::PatternMatch;
    return (SCEVCheckCond && !match(V: SCEVCheckCond, P: m_ZeroInt())) ||
           MemRuntimeCheckCond;
  }
};
2026} // namespace
2027
2028static bool useActiveLaneMask(TailFoldingStyle Style) {
2029 return Style == TailFoldingStyle::Data ||
2030 Style == TailFoldingStyle::DataAndControlFlow ||
2031 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2032}
2033
2034static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2035 return Style == TailFoldingStyle::DataAndControlFlow ||
2036 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2037}
2038
2039// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2040// vectorization. The loop needs to be annotated with #pragma omp simd
2041// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2042// vector length information is not provided, vectorization is not considered
2043// explicit. Interleave hints are not allowed either. These limitations will be
2044// relaxed in the future.
2045// Please, note that we are currently forced to abuse the pragma 'clang
2046// vectorize' semantics. This pragma provides *auto-vectorization hints*
2047// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2048// provides *explicit vectorization hints* (LV can bypass legal checks and
2049// assume that vectorization is legal). However, both hints are implemented
2050// using the same metadata (llvm.loop.vectorize, processed by
2051// LoopVectorizeHints). This will be fixed in the future when the native IR
2052// representation for pragma 'omp simd' is introduced.
2053static bool isExplicitVecOuterLoop(Loop *OuterLp,
2054 OptimizationRemarkEmitter *ORE) {
2055 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2056 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2057
2058 // Only outer loops with an explicit vectorization hint are supported.
2059 // Unannotated outer loops are ignored.
2060 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2061 return false;
2062
2063 Function *Fn = OuterLp->getHeader()->getParent();
2064 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2065 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2066 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2067 return false;
2068 }
2069
2070 if (Hints.getInterleave() > 1) {
2071 // TODO: Interleave support is future work.
2072 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2073 "outer loops.\n");
2074 Hints.emitRemarkWithHints();
2075 return false;
2076 }
2077
2078 return true;
2079}
2080
2081static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2082 OptimizationRemarkEmitter *ORE,
2083 SmallVectorImpl<Loop *> &V) {
2084 // Collect inner loops and outer loops without irreducible control flow. For
2085 // now, only collect outer loops that have explicit vectorization hints. If we
2086 // are stress testing the VPlan H-CFG construction, we collect the outermost
2087 // loop of every loop nest.
2088 if (L.isInnermost() || VPlanBuildStressTest ||
2089 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2090 LoopBlocksRPO RPOT(&L);
2091 RPOT.perform(LI);
2092 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2093 V.push_back(Elt: &L);
2094 // TODO: Collect inner loops inside marked outer loops in case
2095 // vectorization fails for the outer loop. Do not invoke
2096 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2097 // already known to be reducible. We can use an inherited attribute for
2098 // that.
2099 return;
2100 }
2101 }
2102 for (Loop *InnerL : L)
2103 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2104}
2105
2106//===----------------------------------------------------------------------===//
2107// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2108// LoopVectorizationCostModel and LoopVectorizationPlanner.
2109//===----------------------------------------------------------------------===//
2110
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  // Bring Index to the step's type: sext/trunc for integer steps, signed
  // int-to-FP conversion for FP steps.
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Add with trivial folding: X + 0 -> X, 0 + Y -> Y.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(V: X, P: m_ZeroInt()))
      return Y;
    if (match(V: Y, P: m_ZeroInt()))
      return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  // Mul with trivial folding: X * 1 -> X, 1 * Y -> Y.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(V: X, P: m_One()))
      return Y;
    if (match(V: Y, P: m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // A step of -1 folds StartValue + Index * -1 into StartValue - Index.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    // Pointer induction: StartValue advanced by Index * Step bytes/elements.
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    // Reuse the original FAdd/FSub opcode so the transformed index combines
    // with StartValue the same way the scalar induction did.
    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2193
2194static std::optional<unsigned> getMaxVScale(const Function &F,
2195 const TargetTransformInfo &TTI) {
2196 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2197 return MaxVScale;
2198
2199 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2200 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2201
2202 return std::nullopt;
2203}
2204
/// For the given VF and UF and maximum trip count computed for the loop,
/// return true iff the induction variable of the vectorized loop is known not
/// to overflow, i.e. a runtime overflow check would always evaluate to false
/// and can be removed. Returns false (conservatively) whenever this cannot be
/// proven, e.g. when the maximum trip count is unknown.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // For scalable vectors the true step is VF * vscale; without an upper
      // bound on vscale the step cannot be bounded, so stay conservative.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // No overflow iff TC + MaxVF * MaxUF fits the induction type, i.e.
    // (UINT_MAX - TC) > MaxVF * MaxUF.
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2236
2237// Return whether we allow using masked interleave-groups (for dealing with
2238// strided loads/stores that reside in predicated blocks, or for dealing
2239// with gaps).
2240static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2241 // If an override option has been passed in for interleaved accesses, use it.
2242 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2243 return EnableMaskedInterleavedMemAccesses;
2244
2245 return TTI.enableMaskedInterleavedAccessVectorization();
2246}
2247
/// Compute -- and cache in VectorTripCount -- the number of iterations the
/// vector loop will execute: the trip count rounded down to a multiple of
/// VF * UF (after first rounding it up when the tail is folded by masking).
/// The computation is emitted at the end of \p InsertBlock.
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
  // Reuse the cached value if it was already computed.
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(LHS: TC, RHS: Builder.CreateSub(LHS: Step, RHS: ConstantInt::get(Ty, V: 1)),
                           Name: "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
    R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
  }

  VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec");

  return VectorTripCount;
}
2297
/// Wire the runtime-check block \p CheckIRBB into the VPlan: it is wrapped in
/// a VPIRBasicBlock, inserted on the edge leading to the vector preheader, and
/// additionally connected to the scalar preheader.
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
  VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPB, BlockPtr: CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
  // Swap so the scalar preheader is successor 0, matching the invariant
  // asserted for the original predecessor above.
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
  }
}
2321
/// Create and return the i1 value of the minimum-iteration check: true when
/// the vector loop must be bypassed because the trip count is too small for a
/// single vector iteration or -- for scalable VFs with tail folding -- when
/// the induction variable could overflow. May fold to a constant when
/// ScalarEvolution can decide the comparison at compile time.
Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
                                                      unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                     : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
    if (!VF.isScalable())
      return MinProfTC;
    // For scalable VFs the larger of the two bounds is only known at runtime.
    return Builder.CreateBinaryIntrinsic(
        ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
                                    LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
    Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
  }
  return CheckMinIters;
}
2389
/// Emit the minimum-iteration check and split off a new vector preheader; a
/// true check branches to \p Bypass (the scalar preheader) and a false one
/// continues to the new vector preheader.
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  Value *CheckMinIters = createIterationCountCheck(VF, UF);
  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                                  DT: static_cast<DominatorTree *>(nullptr), LI,
                                  MSSAU: nullptr, BBName: "vector.ph");

  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  // Attach branch weights only when the original loop carries profile data.
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
             TCCheckBlock &&
         "Plan's entry must be TCCCheckBlock");
}
2408
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
  for (auto &R : make_early_inc_range(Range&: *VPBB)) {
    // Moving in block order preserves the invariant that phi recipes always
    // precede non-phi recipes in the destination block.
    assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
           "Tried to move phi recipe after a non-phi recipe");
    R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());
  }

  // Rewire all edges of VPBB to the new VPIRBasicBlock.
  VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
2424
/// Prepare the CFG for vectorization: reuse the original preheader as the
/// future vector preheader and split off a new scalar preheader (named with
/// \p Prefix) in front of the original loop.
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  LoopScalarPreHeader =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
                 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph");
  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
  // preheader may be unreachable at this point. Instead it is replaced in
  // createVectorizedLoopSkeleton.
}
2440
2441/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2442/// expansion results.
2443static Value *getExpandedStep(const InductionDescriptor &ID,
2444 const SCEV2ValueTy &ExpandedSCEVs) {
2445 const SCEV *Step = ID.getStep();
2446 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
2447 return C->getValue();
2448 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
2449 return U->getValue();
2450 Value *V = ExpandedSCEVs.lookup(Val: Step);
2451 assert(V && "SCEV must be expanded at this point");
2452 return V;
2453}
2454
2455/// Knowing that loop \p L executes a single vector iteration, add instructions
2456/// that will get simplified and thus should not have any cost to \p
2457/// InstsToIgnore.
2458static void addFullyUnrolledInstructionsToIgnore(
2459 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2460 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2461 auto *Cmp = L->getLatchCmpInst();
2462 if (Cmp)
2463 InstsToIgnore.insert(Ptr: Cmp);
2464 for (const auto &KV : IL) {
2465 // Extract the key by hand so that it can be used in the lambda below. Note
2466 // that captured structured bindings are a C++20 extension.
2467 const PHINode *IV = KV.first;
2468
2469 // Get next iteration value of the induction variable.
2470 Instruction *IVInst =
2471 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2472 if (all_of(Range: IVInst->users(),
2473 P: [&](const User *U) { return U == IV || U == Cmp; }))
2474 InstsToIgnore.insert(Ptr: IVInst);
2475 }
2476}
2477
/// Create the skeleton CFG for the vectorized loop and emit the
/// minimum-iteration check; returns the vector preheader.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop (created during VPlan execution).
  |     |
  |     v
  \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
   |    |                       successors created during VPlan execution)
   \/   |
   /\   v
   | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
   |    |          wrapped in VPIRBasicBlock).
    \   |
     \  v
      >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton(Prefix: "");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(Bypass: LoopScalarPreHeader);

  // The scalar preheader is guaranteed to be reachable now, so it can finally
  // be wrapped in a VPIRBasicBlock (see NOTE in createVectorLoopSkeleton).
  replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
  return LoopVectorPreHeader;
}
2528
2529namespace {
2530
/// DenseMapInfo implementation that hashes and compares instructions
/// structurally (via isIdenticalTo), used by cse() to detect duplicate
/// computations.
struct CSEDenseMapInfo {
  // Only these instruction kinds participate in CSE.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  // Hash the opcode together with all operands so structurally identical
  // instructions land in the same bucket.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(args: I->getOpcode(),
                        args: hash_combine_range(R: I->operand_values()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys only compare equal to themselves; never dereference them.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};
2558
2559} // end anonymous namespace
2560
2561///Perform cse of induction variable instructions.
2562static void cse(BasicBlock *BB) {
2563 // Perform simple cse.
2564 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2565 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2566 if (!CSEDenseMapInfo::canHandle(I: &In))
2567 continue;
2568
2569 // Check if we can replace this instruction with any of the
2570 // visited instructions.
2571 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2572 In.replaceAllUsesWith(V);
2573 In.eraseFromParent();
2574 continue;
2575 }
2576
2577 CSEMap[&In] = &In;
2578 }
2579}
2580
2581/// This function attempts to return a value that represents the vectorization
2582/// factor at runtime. For fixed-width VFs we know this precisely at compile
2583/// time, but for scalable VFs we calculate it based on an estimate of the
2584/// vscale value.
2585static unsigned getEstimatedRuntimeVF(ElementCount VF,
2586 std::optional<unsigned> VScale) {
2587 unsigned EstimatedVF = VF.getKnownMinValue();
2588 if (VF.isScalable())
2589 if (VScale)
2590 EstimatedVF *= *VScale;
2591 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2592 return EstimatedVF;
2593}
2594
/// Return the cost of the call \p CI at \p VF. For vector VFs the cost was
/// pre-computed as part of the call widening decision; for VF=1 it is
/// computed here from the scalar call (or cheaper intrinsic) cost.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return getCallWideningDecision(CI, VF).Cost;

  Type *RetTy = CI->getType();
  // An fmuladd call may be part of a reduction pattern with its own cost.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
      return *RedCost;

  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    // Pick the cheaper of the libcall and intrinsic lowerings.
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
2622
2623static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2624 if (VF.isScalar() || !canVectorizeTy(Ty))
2625 return Ty;
2626 return toVectorizedTy(Ty, EC: VF);
2627}
2628
/// Compute the cost of lowering the call \p CI as a vector intrinsic at
/// \p VF, widening the return and parameter types accordingly.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
  // Preserve the call's fast-math flags, if any, in the cost query.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  // Widen each parameter type to the corresponding vector type for VF.
  SmallVector<Type *> ParamTys;
  std::transform(first: FTy->param_begin(), last: FTy->param_end(),
                 result: std::back_inserter(x&: ParamTys),
                 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(Val: CI),
                                    InstructionCost::getInvalid(), TLI);
  return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
}
2651
/// Final fix-ups after VPlan execution: complete widened non-induction phis,
/// invalidate stale SCEV results, CSE redundant induction instructions in the
/// vector-loop header, and re-distribute profile weights.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  SmallVector<BasicBlock *> ExitBlocks;
  OrigLoop->getExitBlocks(ExitBlocks);
  for (BasicBlock *Exit : ExitBlocks)
    for (PHINode &PN : Exit->phis())
      PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(L: OrigLoop);
  PSE.getSE()->forgetBlockAndLoopDispositions();

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
  if (!HeaderVPBB)
    return;

  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  cse(BB: HeaderBB);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // use the value of vscale used for tuning.
  Loop *VectorLoop = LI->getLoopFor(BB: HeaderBB);
  unsigned EstimatedVFxUF =
      getEstimatedRuntimeVF(VF: VF * UF, VScale: Cost->getVScaleForTuning());
  setProfileInfoAfterUnrolling(OrigLoop, UnrolledLoop: VectorLoop, RemainderLoop: OrigLoop, UF: EstimatedVFxUF);
}
2699
2700void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2701 auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
2702 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2703 for (VPRecipeBase &P : VPBB->phis()) {
2704 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
2705 if (!VPPhi)
2706 continue;
2707 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
2708 // Make sure the builder has a valid insert point.
2709 Builder.SetInsertPoint(NewPhi);
2710 for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) {
2711 VPValue *Inc = VPPhi->getIncomingValue(Idx);
2712 const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
2713 NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
2714 }
2715 }
2716 }
2717}
2718
/// Collect the set of instructions known to remain scalar after vectorization
/// with factor \p VF; the result is recorded in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(R&: Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about getelementptr instructions contained in the loop
    // (see IsLoopVaryingGEP).
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) getelementptr
  // instructions used by memory accesses requiring a scalar use, and (3)
  // instructions explicitly forced to be scalar (inserted further below).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(R&: Uniforms[VF]);

  // (2) Add to the worklist all getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any getelementptr instructions
  // we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(R&: Worklist);
}
2907
/// Returns true if \p I requires predication and has no non-scalar lowering
/// (masked memory op, gather/scatter, widened call, or safe-divisor idiom) at
/// \p VF, i.e. it will be scalarized and predicated.
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    // A scalar VF has no vector call lowering; otherwise consult the
    // pre-computed call widening decision.
    if (VF.isScalar())
      return true;
    return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    // A masked load/store or gather/scatter avoids scalarization when the
    // target supports it for this type, alignment and address space.
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                             TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                        : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                            TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
2948
2949// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2950bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
2951 // TODO: We can use the loop-preheader as context point here and get
2952 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2953 if (isSafeToSpeculativelyExecute(I) ||
2954 (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
2955 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(Val: I))
2956 return false;
2957
2958 // If the instruction was executed conditionally in the original scalar loop,
2959 // predication is needed with a mask whose lanes are all possibly inactive.
2960 if (Legal->blockNeedsPredication(BB: I->getParent()))
2961 return true;
2962
2963 // If we're not folding the tail by masking, predication is unnecessary.
2964 if (!foldTailByMasking())
2965 return false;
2966
2967 // All that remain are instructions with side-effects originally executed in
2968 // the loop unconditionally, but now execute under a tail-fold mask (only)
2969 // having at least one active lane (the first). If the side-effects of the
2970 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2971 // - it will cause the same side-effects as when masked.
2972 switch(I->getOpcode()) {
2973 default:
2974 llvm_unreachable(
2975 "instruction should have been considered by earlier checks");
2976 case Instruction::Call:
2977 // Side-effects of a Call are assumed to be non-invariant, needing a
2978 // (fold-tail) mask.
2979 assert(Legal->isMaskRequired(I) &&
2980 "should have returned earlier for calls not needing a mask");
2981 return true;
2982 case Instruction::Load:
2983 // If the address is loop invariant no predication is needed.
2984 return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
2985 case Instruction::Store: {
2986 // For stores, we need to prove both speculation safety (which follows from
2987 // the same argument as loads), but also must prove the value being stored
2988 // is correct. The easiest form of the later is to require that all values
2989 // stored are the same.
2990 return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
2991 Legal->isInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
2992 }
2993 case Instruction::UDiv:
2994 case Instruction::SDiv:
2995 case Instruction::SRem:
2996 case Instruction::URem:
2997 // If the divisor is loop-invariant no predication is needed.
2998 return !Legal->isInvariant(V: I->getOperand(i: 1));
2999 }
3000}
3001
// Compute the costs of the two alternative lowerings for a predicated
// div/rem: (1) scalarizing with per-lane predicated blocks, and (2) keeping
// the operation vectorized by selecting a "safe" divisor for masked-off
// lanes. Returns {scalarization cost, safe-divisor cost}; the scalarization
// cost is invalid for scalable VFs, for which scalarization is not legal.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  // A speculatable div/rem would not be a predicated instruction in the
  // first place (see isPredicatedInst).
  assert(!isSafeToSpeculativelyExecute(I));

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
                             CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
                             VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  Value *Op2 = I->getOperand(i: 1);
  auto Op2Info = TTI.getOperandInfo(V: Op2);
  // A loop-invariant divisor is as good as a uniform one for costing
  // purposes, even if TTI could not classify it on its own.
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(V: Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: Op2Info, Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
3066
3067bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3068 Instruction *I, ElementCount VF) const {
3069 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3070 assert(getWideningDecision(I, VF) == CM_Unknown &&
3071 "Decision should not be set yet.");
3072 auto *Group = getInterleavedAccessGroup(Instr: I);
3073 assert(Group && "Must have a group.");
3074 unsigned InterleaveFactor = Group->getFactor();
3075
3076 // If the instruction's allocated size doesn't equal its type size, it
3077 // requires padding and will be scalarized.
3078 auto &DL = I->getDataLayout();
3079 auto *ScalarTy = getLoadStoreType(I);
3080 if (hasIrregularType(Ty: ScalarTy, DL))
3081 return false;
3082
3083 // For scalable vectors, the interleave factors must be <= 8 since we require
3084 // the (de)interleaveN intrinsics instead of shufflevectors.
3085 if (VF.isScalable() && InterleaveFactor > 8)
3086 return false;
3087
3088 // If the group involves a non-integral pointer, we may not be able to
3089 // losslessly cast all values to a common type.
3090 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
3091 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3092 Instruction *Member = Group->getMember(Index: Idx);
3093 if (!Member)
3094 continue;
3095 auto *MemberTy = getLoadStoreType(I: Member);
3096 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
3097 // Don't coerce non-integral pointers to integers or vice versa.
3098 if (MemberNI != ScalarNI)
3099 // TODO: Consider adding special nullptr value case here
3100 return false;
3101 if (MemberNI && ScalarNI &&
3102 ScalarTy->getPointerAddressSpace() !=
3103 MemberTy->getPointerAddressSpace())
3104 return false;
3105 }
3106
3107 // Check if masking is required.
3108 // A Group may need masking for one of two reasons: it resides in a block that
3109 // needs predication, or it was decided to use masking to deal with gaps
3110 // (either a gap at the end of a load-access that may result in a speculative
3111 // load, or any gaps in a store-access).
3112 bool PredicatedAccessRequiresMasking =
3113 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
3114 Legal->isMaskRequired(I);
3115 bool LoadAccessWithGapsRequiresEpilogMasking =
3116 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
3117 !isScalarEpilogueAllowed();
3118 bool StoreAccessWithGapsRequiresMasking =
3119 isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor());
3120 if (!PredicatedAccessRequiresMasking &&
3121 !LoadAccessWithGapsRequiresEpilogMasking &&
3122 !StoreAccessWithGapsRequiresMasking)
3123 return true;
3124
3125 // If masked interleaving is required, we expect that the user/target had
3126 // enabled it, because otherwise it either wouldn't have been created or
3127 // it should have been invalidated by the CostModel.
3128 assert(useMaskedInterleavedAccesses(TTI) &&
3129 "Masked interleave-groups for predicated accesses are not enabled.");
3130
3131 if (Group->isReverse())
3132 return false;
3133
3134 auto *Ty = getLoadStoreType(I);
3135 const Align Alignment = getLoadStoreAlignment(I);
3136 unsigned AS = getLoadStoreAddressSpace(I);
3137 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
3138 : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
3139}
3140
3141bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3142 Instruction *I, ElementCount VF) {
3143 // Get and ensure we have a valid memory instruction.
3144 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3145
3146 auto *Ptr = getLoadStorePointerOperand(V: I);
3147 auto *ScalarTy = getLoadStoreType(I);
3148
3149 // In order to be widened, the pointer should be consecutive, first of all.
3150 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3151 return false;
3152
3153 // If the instruction is a store located in a predicated block, it will be
3154 // scalarized.
3155 if (isScalarWithPredication(I, VF))
3156 return false;
3157
3158 // If the instruction's allocated size doesn't equal it's type size, it
3159 // requires padding and will be scalarized.
3160 auto &DL = I->getDataLayout();
3161 if (hasIrregularType(Ty: ScalarTy, DL))
3162 return false;
3163
3164 return true;
3165}
3166
// Collect, for the given VF, the instructions that will remain uniform after
// vectorization, i.e. whose users only demand lane 0 of the unrolled
// iterations. The result is cached in Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
    if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Return true if I's widening decision keeps its pointer operand uniform
  // (widened consecutive/reverse accesses and interleaved accesses use a
  // single scalar pointer).
  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          // These intrinsics only need to execute once per vector iteration
          // when their operands are loop-invariant.
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
      auto *UI = cast<Instruction>(Val: U);
      return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    // NOTE(review): unlike the UniformInd check above, out-of-loop users are
    // not exempted here — presumably deliberate/conservative for the update's
    // exit-block (LCSSA) users; confirm against collectLoopScalars, which does
    // exempt them.
    bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert_range(R&: Worklist);
}
3401
3402bool LoopVectorizationCostModel::runtimeChecksRequired() {
3403 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3404
3405 if (Legal->getRuntimePointerChecking()->Need) {
3406 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3407 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3408 "loop with '#pragma clang loop vectorize(enable)' when "
3409 "compiling with -Os/-Oz",
3410 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3411 return true;
3412 }
3413
3414 if (!PSE.getPredicate().isAlwaysTrue()) {
3415 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3416 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3417 "loop with '#pragma clang loop vectorize(enable)' when "
3418 "compiling with -Os/-Oz",
3419 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3420 return true;
3421 }
3422
3423 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3424 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3425 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3426 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3427 "this loop without such check by compiling with -Os/-Oz",
3428 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3429 return true;
3430 }
3431
3432 return false;
3433}
3434
3435bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3436 if (IsScalableVectorizationAllowed)
3437 return *IsScalableVectorizationAllowed;
3438
3439 IsScalableVectorizationAllowed = false;
3440 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3441 return false;
3442
3443 if (Hints->isScalableVectorizationDisabled()) {
3444 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3445 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3446 return false;
3447 }
3448
3449 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3450
3451 auto MaxScalableVF = ElementCount::getScalable(
3452 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3453
3454 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3455 // FIXME: While for scalable vectors this is currently sufficient, this should
3456 // be replaced by a more detailed mechanism that filters out specific VFs,
3457 // instead of invalidating vectorization for a whole set of VFs based on the
3458 // MaxVF.
3459
3460 // Disable scalable vectorization if the loop contains unsupported reductions.
3461 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3462 reportVectorizationInfo(
3463 Msg: "Scalable vectorization not supported for the reduction "
3464 "operations found in this loop.",
3465 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3466 return false;
3467 }
3468
3469 // Disable scalable vectorization if the loop contains any instructions
3470 // with element types not supported for scalable vectors.
3471 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3472 return !Ty->isVoidTy() &&
3473 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3474 })) {
3475 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3476 "for all element types found in this loop.",
3477 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3478 return false;
3479 }
3480
3481 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3482 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3483 "for safe distance analysis.",
3484 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3485 return false;
3486 }
3487
3488 IsScalableVectorizationAllowed = true;
3489 return true;
3490}
3491
// Compute the maximum legal scalable VF, given MaxSafeElements (the maximum
// safe number of elements derived from dependence analysis). Returns
// scalable-0 when scalable vectorization is not allowed or infeasible.
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!isScalableVectorizationAllowed())
    return ElementCount::getScalable(MinVal: 0);

  // Start from the widest representable scalable VF and clamp it below.
  auto MaxScalableVF = ElementCount::getScalable(
      MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
  // Limit MaxScalableVF by the maximum safe dependence distance.
  // Note: MaxVScale is guaranteed to have a value here — when the loop is not
  // safe for any vector width, isScalableVectorizationAllowed() has already
  // returned false if getMaxVScale() had no value.
  MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);

  if (!MaxScalableVF)
    reportVectorizationInfo(
        Msg: "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        ORETag: "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
3514
// Determine the maximum feasible fixed and scalable VFs, honoring a
// user-specified VF when it is safe, and otherwise clamping to the maximum
// safe VFs derived from dependence analysis and target limits.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Additionally clamp by the store-to-load forwarding distance when it is
  // not safe for arbitrary widths.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);

  // Record the element limit only when dependences actually constrain it.
  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // A scalable UserVF that is unsafe or unsupported is dropped entirely
    // (with a remark); selection falls through to the target-driven path.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Compute target-maximized VFs within the safe bounds; fixed defaults to 1
  // (scalar) and scalable to 0 (none) when maximization fails.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3626
3627FixedScalableVFPair
3628LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3629 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3630 // TODO: It may be useful to do since it's still likely to be dynamically
3631 // uniform if the target can skip.
3632 reportVectorizationFailure(
3633 DebugMsg: "Not inserting runtime ptr check for divergent target",
3634 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3635 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3636 return FixedScalableVFPair::getNone();
3637 }
3638
3639 ScalarEvolution *SE = PSE.getSE();
3640 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3641 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3642 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3643 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3644 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3645 if (TC.isScalar()) {
3646 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3647 OREMsg: "loop trip count is one, irrelevant for vectorization",
3648 ORETag: "SingleIterationLoop", ORE, TheLoop);
3649 return FixedScalableVFPair::getNone();
3650 }
3651
3652 // If BTC matches the widest induction type and is -1 then the trip count
3653 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3654 // to vectorize.
3655 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3656 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3657 BTC->getType()->getScalarSizeInBits() >=
3658 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3659 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3660 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3661 reportVectorizationFailure(
3662 DebugMsg: "Trip count computation wrapped",
3663 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3664 ORETag: "TripCountWrapped", ORE, TheLoop);
3665 return FixedScalableVFPair::getNone();
3666 }
3667
3668 switch (ScalarEpilogueStatus) {
3669 case CM_ScalarEpilogueAllowed:
3670 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
3671 case CM_ScalarEpilogueNotAllowedUsePredicate:
3672 [[fallthrough]];
3673 case CM_ScalarEpilogueNotNeededUsePredicate:
3674 LLVM_DEBUG(
3675 dbgs() << "LV: vector predicate hint/switch found.\n"
3676 << "LV: Not allowing scalar epilogue, creating predicated "
3677 << "vector loop.\n");
3678 break;
3679 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3680 // fallthrough as a special case of OptForSize
3681 case CM_ScalarEpilogueNotAllowedOptSize:
3682 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3683 LLVM_DEBUG(
3684 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3685 else
3686 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3687 << "count.\n");
3688
3689 // Bail if runtime checks are required, which are not good when optimising
3690 // for size.
3691 if (runtimeChecksRequired())
3692 return FixedScalableVFPair::getNone();
3693
3694 break;
3695 }
3696
3697 // Now try the tail folding
3698
3699 // Invalidate interleave groups that require an epilogue if we can't mask
3700 // the interleave-group.
3701 if (!useMaskedInterleavedAccesses(TTI)) {
3702 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3703 "No decisions should have been taken at this point");
3704 // Note: There is no need to invalidate any cost modeling decisions here, as
3705 // none were taken so far.
3706 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3707 }
3708
3709 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true);
3710
3711 // Avoid tail folding if the trip count is known to be a multiple of any VF
3712 // we choose.
3713 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3714 MaxFactors.FixedVF.getFixedValue();
3715 if (MaxFactors.ScalableVF) {
3716 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3717 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3718 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3719 a: *MaxPowerOf2RuntimeVF,
3720 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3721 } else
3722 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3723 }
3724
3725 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3726 // Return false if the loop is neither a single-latch-exit loop nor an
3727 // early-exit loop as tail-folding is not supported in that case.
3728 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3729 !Legal->hasUncountableEarlyExit())
3730 return false;
3731 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3732 ScalarEvolution *SE = PSE.getSE();
3733 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3734 // with uncountable exits. For countable loops, the symbolic maximum must
3735 // remain identical to the known back-edge taken count.
3736 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3737 assert((Legal->hasUncountableEarlyExit() ||
3738 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3739 "Invalid loop count");
3740 const SCEV *ExitCount = SE->getAddExpr(
3741 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3742 const SCEV *Rem = SE->getURemExpr(
3743 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3744 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3745 return Rem->isZero();
3746 };
3747
3748 if (MaxPowerOf2RuntimeVF > 0u) {
3749 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3750 "MaxFixedVF must be a power of 2");
3751 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3752 // Accept MaxFixedVF if we do not have a tail.
3753 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3754 return MaxFactors;
3755 }
3756 }
3757
3758 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3759 if (ExpectedTC && ExpectedTC->isFixed() &&
3760 ExpectedTC->getFixedValue() <=
3761 TTI.getMinTripCountTailFoldingThreshold()) {
3762 if (MaxPowerOf2RuntimeVF > 0u) {
3763 // If we have a low-trip-count, and the fixed-width VF is known to divide
3764 // the trip count but the scalable factor does not, use the fixed-width
3765 // factor in preference to allow the generation of a non-predicated loop.
3766 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3767 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3768 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3769 "remain for any chosen VF.\n");
3770 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3771 return MaxFactors;
3772 }
3773 }
3774
3775 reportVectorizationFailure(
3776 DebugMsg: "The trip count is below the minial threshold value.",
3777 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3778 ORE, TheLoop);
3779 return FixedScalableVFPair::getNone();
3780 }
3781
3782 // If we don't know the precise trip count, or if the trip count that we
3783 // found modulo the vectorization factor is not zero, try to fold the tail
3784 // by masking.
3785 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3786 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3787 setTailFoldingStyles(IsScalableVF: ContainsScalableVF, UserIC);
3788 if (foldTailByMasking()) {
3789 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
3790 LLVM_DEBUG(
3791 dbgs()
3792 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3793 "try to generate VP Intrinsics with scalable vector "
3794 "factors only.\n");
3795 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3796 // for now.
3797 // TODO: extend it for fixed vectors, if required.
3798 assert(ContainsScalableVF && "Expected scalable vector factor.");
3799
3800 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3801 }
3802 return MaxFactors;
3803 }
3804
3805 // If there was a tail-folding hint/switch, but we can't fold the tail by
3806 // masking, fallback to a vectorization with a scalar epilogue.
3807 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3808 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3809 "scalar epilogue instead.\n");
3810 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3811 return MaxFactors;
3812 }
3813
3814 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3815 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3816 return FixedScalableVFPair::getNone();
3817 }
3818
3819 if (TC.isZero()) {
3820 reportVectorizationFailure(
3821 DebugMsg: "unable to calculate the loop count due to complex control flow",
3822 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3823 return FixedScalableVFPair::getNone();
3824 }
3825
3826 reportVectorizationFailure(
3827 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3828 OREMsg: "cannot optimize for size and vectorize at the same time. "
3829 "Enable vectorization of this loop with '#pragma clang loop "
3830 "vectorize(enable)' when compiling with -Os/-Oz",
3831 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3832 return FixedScalableVFPair::getNone();
3833}
3834
3835bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
3836 return useMaxBandwidth(RegKind: VF.isScalable()
3837 ? TargetTransformInfo::RGK_ScalableVector
3838 : TargetTransformInfo::RGK_FixedWidthVector);
3839}
3840
3841bool LoopVectorizationCostModel::useMaxBandwidth(
3842 TargetTransformInfo::RegisterKind RegKind) {
3843 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3844 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3845 (UseWiderVFIfCallVariantsPresent &&
3846 Legal->hasVectorCallVariants())));
3847}
3848
3849ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3850 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3851 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3852 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3853 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3854 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3855 : TargetTransformInfo::RGK_FixedWidthVector);
3856
3857 // Convenience function to return the minimum of two ElementCounts.
3858 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3859 assert((LHS.isScalable() == RHS.isScalable()) &&
3860 "Scalable flags must match");
3861 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3862 };
3863
3864 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3865 // Note that both WidestRegister and WidestType may not be a powers of 2.
3866 auto MaxVectorElementCount = ElementCount::get(
3867 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
3868 Scalable: ComputeScalableMaxVF);
3869 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3870 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3871 << (MaxVectorElementCount * WidestType) << " bits.\n");
3872
3873 if (!MaxVectorElementCount) {
3874 LLVM_DEBUG(dbgs() << "LV: The target has no "
3875 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3876 << " vector registers.\n");
3877 return ElementCount::getFixed(MinVal: 1);
3878 }
3879
3880 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
3881 if (MaxVectorElementCount.isScalable() &&
3882 TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
3883 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
3884 auto Min = Attr.getVScaleRangeMin();
3885 WidestRegisterMinEC *= Min;
3886 }
3887
3888 // When a scalar epilogue is required, at least one iteration of the scalar
3889 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3890 // max VF that results in a dead vector loop.
3891 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
3892 MaxTripCount -= 1;
3893
3894 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
3895 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
3896 // If upper bound loop trip count (TC) is known at compile time there is no
3897 // point in choosing VF greater than TC (as done in the loop below). Select
3898 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
3899 // scalable, we only fall back on a fixed VF when the TC is less than or
3900 // equal to the known number of lanes.
3901 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
3902 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3903 "exceeding the constant trip count: "
3904 << ClampedUpperTripCount << "\n");
3905 return ElementCount::get(
3906 MinVal: ClampedUpperTripCount,
3907 Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
3908 }
3909
3910 TargetTransformInfo::RegisterKind RegKind =
3911 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3912 : TargetTransformInfo::RGK_FixedWidthVector;
3913 ElementCount MaxVF = MaxVectorElementCount;
3914 if (useMaxBandwidth(RegKind)) {
3915 auto MaxVectorElementCountMaxBW = ElementCount::get(
3916 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
3917 Scalable: ComputeScalableMaxVF);
3918 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3919
3920 if (ElementCount MinVF =
3921 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
3922 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
3923 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3924 << ") with target's minimum: " << MinVF << '\n');
3925 MaxVF = MinVF;
3926 }
3927 }
3928
3929 // Invalidate any widening decisions we might have made, in case the loop
3930 // requires prediction (decided later), but we have already made some
3931 // load/store widening decisions.
3932 invalidateCostModelingDecisions();
3933 }
3934 return MaxVF;
3935}
3936
3937bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3938 const VectorizationFactor &B,
3939 const unsigned MaxTripCount,
3940 bool HasTail) const {
3941 InstructionCost CostA = A.Cost;
3942 InstructionCost CostB = B.Cost;
3943
3944 // Improve estimate for the vector width if it is scalable.
3945 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3946 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3947 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3948 if (A.Width.isScalable())
3949 EstimatedWidthA *= *VScale;
3950 if (B.Width.isScalable())
3951 EstimatedWidthB *= *VScale;
3952 }
3953
3954 // When optimizing for size choose whichever is smallest, which will be the
3955 // one with the smallest cost for the whole loop. On a tie pick the larger
3956 // vector width, on the assumption that throughput will be greater.
3957 if (CM.CostKind == TTI::TCK_CodeSize)
3958 return CostA < CostB ||
3959 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3960
3961 // Assume vscale may be larger than 1 (or the value being tuned for),
3962 // so that scalable vectorization is slightly favorable over fixed-width
3963 // vectorization.
3964 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
3965 A.Width.isScalable() && !B.Width.isScalable();
3966
3967 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3968 const InstructionCost &RHS) {
3969 return PreferScalable ? LHS <= RHS : LHS < RHS;
3970 };
3971
3972 // To avoid the need for FP division:
3973 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3974 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3975 if (!MaxTripCount)
3976 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3977
3978 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3979 InstructionCost VectorCost,
3980 InstructionCost ScalarCost) {
3981 // If the trip count is a known (possibly small) constant, the trip count
3982 // will be rounded up to an integer number of iterations under
3983 // FoldTailByMasking. The total cost in that case will be
3984 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3985 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3986 // some extra overheads, but for the purpose of comparing the costs of
3987 // different VFs we can use this to compare the total loop-body cost
3988 // expected after vectorization.
3989 if (HasTail)
3990 return VectorCost * (MaxTripCount / VF) +
3991 ScalarCost * (MaxTripCount % VF);
3992 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
3993 };
3994
3995 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3996 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3997 return CmpFn(RTCostA, RTCostB);
3998}
3999
4000bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4001 const VectorizationFactor &B,
4002 bool HasTail) const {
4003 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4004 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
4005 HasTail);
4006}
4007
4008void LoopVectorizationPlanner::emitInvalidCostRemarks(
4009 OptimizationRemarkEmitter *ORE) {
4010 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4011 SmallVector<RecipeVFPair> InvalidCosts;
4012 for (const auto &Plan : VPlans) {
4013 for (ElementCount VF : Plan->vectorFactors()) {
4014 // The VPlan-based cost model is designed for computing vector cost.
4015 // Querying VPlan-based cost model with a scarlar VF will cause some
4016 // errors because we expect the VF is vector for most of the widen
4017 // recipes.
4018 if (VF.isScalar())
4019 continue;
4020
4021 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4022 CM, CM.CostKind);
4023 precomputeCosts(Plan&: *Plan, VF, CostCtx);
4024 auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
4025 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
4026 for (auto &R : *VPBB) {
4027 if (!R.cost(VF, Ctx&: CostCtx).isValid())
4028 InvalidCosts.emplace_back(Args: &R, Args&: VF);
4029 }
4030 }
4031 }
4032 }
4033 if (InvalidCosts.empty())
4034 return;
4035
4036 // Emit a report of VFs with invalid costs in the loop.
4037
4038 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4039 DenseMap<VPRecipeBase *, unsigned> Numbering;
4040 unsigned I = 0;
4041 for (auto &Pair : InvalidCosts)
4042 if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
4043 ++I;
4044
4045 // Sort the list, first on recipe(number) then on VF.
4046 sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4047 unsigned NA = Numbering[A.first];
4048 unsigned NB = Numbering[B.first];
4049 if (NA != NB)
4050 return NA < NB;
4051 return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
4052 });
4053
4054 // For a list of ordered recipe-VF pairs:
4055 // [(load, VF1), (load, VF2), (store, VF1)]
4056 // group the recipes together to emit separate remarks for:
4057 // load (VF1, VF2)
4058 // store (VF1)
4059 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4060 auto Subset = ArrayRef<RecipeVFPair>();
4061 do {
4062 if (Subset.empty())
4063 Subset = Tail.take_front(N: 1);
4064
4065 VPRecipeBase *R = Subset.front().first;
4066
4067 unsigned Opcode =
4068 TypeSwitch<const VPRecipeBase *, unsigned>(R)
4069 .Case<VPHeaderPHIRecipe>(
4070 caseFn: [](const auto *R) { return Instruction::PHI; })
4071 .Case<VPWidenSelectRecipe>(
4072 caseFn: [](const auto *R) { return Instruction::Select; })
4073 .Case<VPWidenStoreRecipe>(
4074 caseFn: [](const auto *R) { return Instruction::Store; })
4075 .Case<VPWidenLoadRecipe>(
4076 caseFn: [](const auto *R) { return Instruction::Load; })
4077 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4078 caseFn: [](const auto *R) { return Instruction::Call; })
4079 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4080 VPWidenCastRecipe>(
4081 caseFn: [](const auto *R) { return R->getOpcode(); })
4082 .Case<VPInterleaveRecipe>(caseFn: [](const VPInterleaveRecipe *R) {
4083 return R->getStoredValues().empty() ? Instruction::Load
4084 : Instruction::Store;
4085 });
4086
4087 // If the next recipe is different, or if there are no other pairs,
4088 // emit a remark for the collated subset. e.g.
4089 // [(load, VF1), (load, VF2))]
4090 // to emit:
4091 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4092 if (Subset == Tail || Tail[Subset.size()].first != R) {
4093 std::string OutString;
4094 raw_string_ostream OS(OutString);
4095 assert(!Subset.empty() && "Unexpected empty range");
4096 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4097 for (const auto &Pair : Subset)
4098 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4099 OS << "):";
4100 if (Opcode == Instruction::Call) {
4101 StringRef Name = "";
4102 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
4103 Name = Int->getIntrinsicName();
4104 } else {
4105 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
4106 Function *CalledFn =
4107 WidenCall ? WidenCall->getCalledScalarFunction()
4108 : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
4109 ->getLiveInIRValue());
4110 Name = CalledFn->getName();
4111 }
4112 OS << " call to " << Name;
4113 } else
4114 OS << " " << Instruction::getOpcodeName(Opcode);
4115 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
4116 DL: R->getDebugLoc());
4117 Tail = Tail.drop_front(N: Subset.size());
4118 Subset = {};
4119 } else
4120 // Grow the subset by one element
4121 Subset = Tail.take_front(N: Subset.size() + 1);
4122 } while (!Tail.empty());
4123}
4124
4125/// Check if any recipe of \p Plan will generate a vector value, which will be
4126/// assigned a vector register.
4127static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4128 const TargetTransformInfo &TTI) {
4129 assert(VF.isVector() && "Checking a scalar VF?");
4130 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4131 DenseSet<VPRecipeBase *> EphemeralRecipes;
4132 collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
4133 // Set of already visited types.
4134 DenseSet<Type *> Visited;
4135 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4136 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
4137 for (VPRecipeBase &R : *VPBB) {
4138 if (EphemeralRecipes.contains(V: &R))
4139 continue;
4140 // Continue early if the recipe is considered to not produce a vector
4141 // result. Note that this includes VPInstruction where some opcodes may
4142 // produce a vector, to preserve existing behavior as VPInstructions model
4143 // aspects not directly mapped to existing IR instructions.
4144 switch (R.getVPDefID()) {
4145 case VPDef::VPDerivedIVSC:
4146 case VPDef::VPScalarIVStepsSC:
4147 case VPDef::VPReplicateSC:
4148 case VPDef::VPInstructionSC:
4149 case VPDef::VPCanonicalIVPHISC:
4150 case VPDef::VPVectorPointerSC:
4151 case VPDef::VPVectorEndPointerSC:
4152 case VPDef::VPExpandSCEVSC:
4153 case VPDef::VPEVLBasedIVPHISC:
4154 case VPDef::VPPredInstPHISC:
4155 case VPDef::VPBranchOnMaskSC:
4156 continue;
4157 case VPDef::VPReductionSC:
4158 case VPDef::VPActiveLaneMaskPHISC:
4159 case VPDef::VPWidenCallSC:
4160 case VPDef::VPWidenCanonicalIVSC:
4161 case VPDef::VPWidenCastSC:
4162 case VPDef::VPWidenGEPSC:
4163 case VPDef::VPWidenIntrinsicSC:
4164 case VPDef::VPWidenSC:
4165 case VPDef::VPWidenSelectSC:
4166 case VPDef::VPBlendSC:
4167 case VPDef::VPFirstOrderRecurrencePHISC:
4168 case VPDef::VPHistogramSC:
4169 case VPDef::VPWidenPHISC:
4170 case VPDef::VPWidenIntOrFpInductionSC:
4171 case VPDef::VPWidenPointerInductionSC:
4172 case VPDef::VPReductionPHISC:
4173 case VPDef::VPInterleaveSC:
4174 case VPDef::VPWidenLoadEVLSC:
4175 case VPDef::VPWidenLoadSC:
4176 case VPDef::VPWidenStoreEVLSC:
4177 case VPDef::VPWidenStoreSC:
4178 break;
4179 default:
4180 llvm_unreachable("unhandled recipe");
4181 }
4182
4183 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4184 unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
4185 if (!NumLegalParts)
4186 return false;
4187 if (VF.isScalable()) {
4188 // <vscale x 1 x iN> is assumed to be profitable over iN because
4189 // scalable registers are a distinct register class from scalar
4190 // ones. If we ever find a target which wants to lower scalable
4191 // vectors back to scalars, we'll need to update this code to
4192 // explicitly ask TTI about the register class uses for each part.
4193 return NumLegalParts <= VF.getKnownMinValue();
4194 }
4195 // Two or more elements that share a register - are vectorized.
4196 return NumLegalParts < VF.getFixedValue();
4197 };
4198
4199 // If no def nor is a store, e.g., branches, continue - no value to check.
4200 if (R.getNumDefinedValues() == 0 &&
4201 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4202 Val: &R))
4203 continue;
4204 // For multi-def recipes, currently only interleaved loads, suffice to
4205 // check first def only.
4206 // For stores check their stored value; for interleaved stores suffice
4207 // the check first stored value only. In all cases this is the second
4208 // operand.
4209 VPValue *ToCheck =
4210 R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
4211 Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
4212 if (!Visited.insert(V: {ScalarTy}).second)
4213 continue;
4214 Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
4215 if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
4216 return true;
4217 }
4218 }
4219
4220 return false;
4221}
4222
4223static bool hasReplicatorRegion(VPlan &Plan) {
4224 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4225 G: Plan.getVectorLoopRegion()->getEntry())),
4226 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4227}
4228
4229#ifndef NDEBUG
4230VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4231 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4232 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4233 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4234 assert(
4235 any_of(VPlans,
4236 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4237 "Expected Scalar VF to be a candidate");
4238
4239 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4240 ExpectedCost);
4241 VectorizationFactor ChosenFactor = ScalarCost;
4242
4243 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4244 if (ForceVectorization &&
4245 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4246 // Ignore scalar width, because the user explicitly wants vectorization.
4247 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4248 // evaluation.
4249 ChosenFactor.Cost = InstructionCost::getMax();
4250 }
4251
4252 for (auto &P : VPlans) {
4253 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4254 P->vectorFactors().end());
4255
4256 SmallVector<VPRegisterUsage, 8> RUs;
4257 if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
4258 CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
4259 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4260
4261 for (unsigned I = 0; I < VFs.size(); I++) {
4262 ElementCount VF = VFs[I];
4263 // The cost for scalar VF=1 is already calculated, so ignore it.
4264 if (VF.isScalar())
4265 continue;
4266
4267 /// Don't consider the VF if it exceeds the number of registers for the
4268 /// target.
4269 if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
4270 continue;
4271
4272 InstructionCost C = CM.expectedCost(VF);
4273
4274 // Add on other costs that are modelled in VPlan, but not in the legacy
4275 // cost model.
4276 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
4277 CM, CM.CostKind);
4278 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4279 assert(VectorRegion && "Expected to have a vector region!");
4280 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4281 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4282 for (VPRecipeBase &R : *VPBB) {
4283 auto *VPI = dyn_cast<VPInstruction>(&R);
4284 if (!VPI)
4285 continue;
4286 switch (VPI->getOpcode()) {
4287 case VPInstruction::ActiveLaneMask:
4288 case VPInstruction::ExplicitVectorLength:
4289 C += VPI->cost(VF, CostCtx);
4290 break;
4291 default:
4292 break;
4293 }
4294 }
4295 }
4296
4297 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4298 unsigned Width =
4299 getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4300 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4301 << " costs: " << (Candidate.Cost / Width));
4302 if (VF.isScalable())
4303 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4304 << CM.getVScaleForTuning().value_or(1) << ")");
4305 LLVM_DEBUG(dbgs() << ".\n");
4306
4307 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4308 LLVM_DEBUG(
4309 dbgs()
4310 << "LV: Not considering vector loop of width " << VF
4311 << " because it will not generate any vector instructions.\n");
4312 continue;
4313 }
4314
4315 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4316 LLVM_DEBUG(
4317 dbgs()
4318 << "LV: Not considering vector loop of width " << VF
4319 << " because it would cause replicated blocks to be generated,"
4320 << " which isn't allowed when optimizing for size.\n");
4321 continue;
4322 }
4323
4324 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4325 ChosenFactor = Candidate;
4326 }
4327 }
4328
4329 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4330 reportVectorizationFailure(
4331 "There are conditional stores.",
4332 "store that is conditionally executed prevents vectorization",
4333 "ConditionalStore", ORE, OrigLoop);
4334 ChosenFactor = ScalarCost;
4335 }
4336
4337 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4338 !isMoreProfitable(ChosenFactor, ScalarCost,
4339 !CM.foldTailByMasking())) dbgs()
4340 << "LV: Vectorization seems to be not beneficial, "
4341 << "but was forced by a user.\n");
4342 return ChosenFactor;
4343}
4344#endif
4345
4346bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4347 ElementCount VF) const {
4348 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4349 // reductions need special handling and are currently unsupported.
4350 if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
4351 if (!Legal->isReductionVariable(PN: &Phi))
4352 return Legal->isFixedOrderRecurrence(Phi: &Phi);
4353 RecurKind RK = Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
4354 return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4355 }))
4356 return false;
4357
4358 // Phis with uses outside of the loop require special handling and are
4359 // currently unsupported.
4360 for (const auto &Entry : Legal->getInductionVars()) {
4361 // Look for uses of the value of the induction at the last iteration.
4362 Value *PostInc =
4363 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4364 for (User *U : PostInc->users())
4365 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4366 return false;
4367 // Look for uses of penultimate value of the induction.
4368 for (User *U : Entry.first->users())
4369 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4370 return false;
4371 }
4372
4373 // Epilogue vectorization code has not been auditted to ensure it handles
4374 // non-latch exits properly. It may be fine, but it needs auditted and
4375 // tested.
4376 // TODO: Add support for loops with an early exit.
4377 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4378 return false;
4379
4380 return true;
4381}
4382
4383bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4384 const ElementCount VF, const unsigned IC) const {
4385 // FIXME: We need a much better cost-model to take different parameters such
4386 // as register pressure, code size increase and cost of extra branches into
4387 // account. For now we apply a very crude heuristic and only consider loops
4388 // with vectorization factors larger than a certain value.
4389
4390 // Allow the target to opt out entirely.
4391 if (!TTI.preferEpilogueVectorization())
4392 return false;
4393
4394 // We also consider epilogue vectorization unprofitable for targets that don't
4395 // consider interleaving beneficial (eg. MVE).
4396 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4397 return false;
4398
4399 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4400 // VFs when deciding profitability.
4401 // See related "TODO: extend to support scalable VFs." in
4402 // selectEpilogueVectorizationFactor.
4403 unsigned Multiplier = VF.isFixed() ? IC : 1;
4404 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4405 ? EpilogueVectorizationMinVF
4406 : TTI.getEpilogueVectorizationMinVF();
4407 return getEstimatedRuntimeVF(VF: VF * Multiplier, VScale: VScaleForTuning) >=
4408 MinVFThreshold;
4409}
4410
// Selects a vectorization factor for the epilogue loop accompanying a main
// loop vectorized with MainLoopVF and interleaved IC times. Returns
// VectorizationFactor::Disabled() when no viable or profitable epilogue
// factor exists.
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  // Without a scalar epilogue there is no remainder loop to vectorize.
  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  // A forced factor bypasses the cost checks below, but is only honored when
  // a VPlan was actually built for it.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC =
        ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
    if (hasPlanWithVF(VF: ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                         "viable.\n");
    return Result;
  }

  // Epilogue vectorization grows code size; skip it when optimizing for size.
  if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
      MinVal: getEstimatedRuntimeVF(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));

  // Scan the previously computed profitable VFs for the best candidate that
  // is narrower than the main loop's (estimated) VF.
  ScalarEvolution &SE = *PSE.getSE();
  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(VF: NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
        (NextVF.Width.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
         ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        // Lazily compute TC % (MainLoopVF * IC), i.e. the iterations left
        // over for the epilogue, and an upper bound on its trip count.
        const SCEV *TC = vputils::getSCEVExprForVPValue(
            V: getPlanFor(VF: NextVF.Width).getTripCount(), SE);
        assert(!isa<SCEVCouldNotCompute>(TC) &&
               "Trip count SCEV must be computable");
        RemainingIterations = SE.getURemExpr(
            LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getFixedValue() * IC));
        MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
        // Tighten the bound if SCEV can prove the remainder is smaller.
        if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
                                RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
          MaxTripCount =
              SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
        }
        LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
                          << MaxTripCount << "\n");
      }
      if (SE.isKnownPredicate(
              Pred: CmpInst::ICMP_UGT,
              LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getFixedValue()),
              RHS: RemainingIterations))
        continue;
    }

    // Take the first viable candidate, then keep the most profitable one.
    if (Result.Width.isScalar() ||
        isMoreProfitable(A: NextVF, B: Result, MaxTripCount,
                         HasTail: !CM.foldTailByMasking()))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
4518
4519std::pair<unsigned, unsigned>
4520LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4521 unsigned MinWidth = -1U;
4522 unsigned MaxWidth = 8;
4523 const DataLayout &DL = TheFunction->getDataLayout();
4524 // For in-loop reductions, no element types are added to ElementTypesInLoop
4525 // if there are no loads/stores in the loop. In this case, check through the
4526 // reduction variables to determine the maximum width.
4527 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4528 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4529 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4530 // When finding the min width used by the recurrence we need to account
4531 // for casts on the input operands of the recurrence.
4532 MinWidth = std::min(
4533 a: MinWidth,
4534 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4535 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4536 MaxWidth = std::max(a: MaxWidth,
4537 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4538 }
4539 } else {
4540 for (Type *T : ElementTypesInLoop) {
4541 MinWidth = std::min<unsigned>(
4542 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4543 MaxWidth = std::max<unsigned>(
4544 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4545 }
4546 }
4547 return {MinWidth, MaxWidth};
4548}
4549
4550void LoopVectorizationCostModel::collectElementTypesForWidening() {
4551 ElementTypesInLoop.clear();
4552 // For each block.
4553 for (BasicBlock *BB : TheLoop->blocks()) {
4554 // For each instruction in the loop.
4555 for (Instruction &I : BB->instructionsWithoutDebug()) {
4556 Type *T = I.getType();
4557
4558 // Skip ignored values.
4559 if (ValuesToIgnore.count(Ptr: &I))
4560 continue;
4561
4562 // Only examine Loads, Stores and PHINodes.
4563 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4564 continue;
4565
4566 // Examine PHI nodes that are reduction variables. Update the type to
4567 // account for the recurrence type.
4568 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4569 if (!Legal->isReductionVariable(PN))
4570 continue;
4571 const RecurrenceDescriptor &RdxDesc =
4572 Legal->getRecurrenceDescriptor(PN);
4573 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4574 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4575 Ty: RdxDesc.getRecurrenceType()))
4576 continue;
4577 T = RdxDesc.getRecurrenceType();
4578 }
4579
4580 // Examine the stored values.
4581 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4582 T = ST->getValueOperand()->getType();
4583
4584 assert(T->isSized() &&
4585 "Expected the load/store/recurrence type to be sized");
4586
4587 ElementTypesInLoop.insert(Ptr: T);
4588 }
4589 }
4590}
4591
/// Selects the interleave count for vectorization factor \p VF of \p Plan,
/// given \p LoopCost, the cost of one loop iteration (0 means it still needs
/// to be computed here). Returns 1 whenever interleaving does not apply or is
/// not expected to pay off.
unsigned
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                  InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave when a scalar epilogue is allowed.
  if (!isScalarEpilogueAllowed())
    return 1;

  // Do not interleave if EVL is preferred and no User IC is specified.
  if (foldTailWithEVL()) {
    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Legal->hasUncountableEarlyExit())
    return 1;

  const bool HasReductions = !Legal->getReductionVars().empty();

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    LoopCost = expectedCost(VF);
    assert(LoopCost.isValid() &&
           "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  // Estimate the register usage of one vector iteration of the plan.
  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore)[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(a: Pair.second, b: 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Take the minimum IC over all register classes used by the loop.
  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Key: Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
                                     MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(a: 1U, b: (MaxLocalUsers - 1)));
    }

    IC = std::min(a: IC, b: TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Number of lanes the loop is expected to handle per vector iteration at
  // runtime (scales scalable VFs by the tuning vscale).
  unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScale: VScaleForTuning);

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  if (auto BestKnownTC = getSmallBestKnownTC(PSE, L: TheLoop)) {
    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector())
                               ? BestKnownTC->getFixedValue() - 1
                               : BestKnownTC->getFixedValue();

    unsigned InterleaveCountLB = bit_floor(Value: std::max(
        a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));

    if (getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(Value: std::max(
          a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(a: 1u, b: IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
                                          Value: SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          RecurKind RK = RdxDesc.getRecurrenceKind();
          return RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) ||
                 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK);
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(a: SmallIC, b: F);
      StoresIC = std::min(a: StoresIC, b: F);
      LoadsIC = std::min(a: LoadsIC, b: F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(a: StoresIC, b: LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(a: IC / 2, b: SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4877
4878bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4879 ElementCount VF) {
4880 // TODO: Cost model for emulated masked load/store is completely
4881 // broken. This hack guides the cost model to use an artificially
4882 // high enough value to practically disable vectorization with such
4883 // operations, except where previously deployed legality hack allowed
4884 // using very low cost values. This is to avoid regressions coming simply
4885 // from moving "masked load/store" check from legality to cost model.
4886 // Masked Load/Gather emulation was previously never allowed.
4887 // Limited number of Masked Store/Scatter emulation was allowed.
4888 assert((isPredicatedInst(I)) &&
4889 "Expecting a scalar emulated instruction");
4890 return isa<LoadInst>(Val: I) ||
4891 (isa<StoreInst>(Val: I) &&
4892 NumPredStores > NumberOfStoresToPredicate);
4893}
4894
// Collects, for vectorization factor VF, the instructions that are more
// profitable to keep scalar (and predicated) than to if-convert and widen.
// Results are memoized in InstsToScalarize[VF]; the blocks that will remain
// predicated are recorded in PredicatedBBsAfterVectorization[VF].
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  assert(VF.isVector() && "Expected VF >= 2");

  // If we've already collected the instructions to scalarize or the predicated
  // BBs after vectorization, there's nothing to do. Collection may already have
  // occurred if we have a user-selected VF and are now computing the expected
  // cost for interleaving.
  if (InstsToScalarize.contains(Val: VF) ||
      PredicatedBBsAfterVectorization.contains(Val: VF))
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(I: &I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        // of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
        if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
            !useEmulatedMaskMemRefHack(I: &I, VF) &&
            computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
          // A non-negative discount means scalarizing the chain is at least
          // as cheap as vectorizing it; commit the per-instruction costs.
          ScalarCostsVF.insert_range(R&: ScalarCosts);
          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, Cost] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(Val: I);
            if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
              continue;
            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
            CallWideningDecisions[{CI, VF}].Cost = Cost;
          }
        }
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
        // A predecessor whose sole successor is BB is also treated as
        // remaining after vectorization.
        for (auto *Pred : predecessors(BB)) {
          if (Pred->getSingleSuccessor() == BB)
            PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
        }
      }
  }
}
4948
// Computes the cost discount obtained by scalarizing (rather than
// vectorizing) the single-use chain of instructions feeding PredInst, for
// factor VF. A non-negative result means the scalar form is at least as
// cheap; the per-instruction scalar costs are recorded in ScalarCosts.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Val: I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
      for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            Ty: cast<VectorType>(Val: VectorTy),
            DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // One phi per scalarized lane merges the predicated results back in.
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
          for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                Ty: cast<VectorType>(Val: VectorTy),
                DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
                /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind);

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5070
// Returns the expected cost of one iteration of the loop when vectorized
// with factor VF, summing per-instruction costs over all blocks and, for the
// scalar case, scaling predicated blocks by their execution probability.
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
  InstructionCost Cost;

  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
  SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
  auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
  if (TC == VF && !foldTailByMasking())
    addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
                                         InstsToIgnore&: ValuesToIgnoreForVF);

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    InstructionCost BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
          (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
        continue;

      InstructionCost C = getInstructionCost(I: &I, VF);

      // Check if we should override the cost.
      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
        C = InstructionCost(ForceTargetInstructionCost);

      BlockCost += C;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
                        << VF << " For instruction: " << I << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost /= getPredBlockCostDivisor(CostKind);

    Cost += BlockCost;
  }

  return Cost;
}
5120
5121/// Gets Address Access SCEV after verifying that the access pattern
5122/// is loop invariant except the induction variable dependence.
5123///
5124/// This SCEV can be sent to the Target in order to estimate the address
5125/// calculation cost.
5126static const SCEV *getAddressAccessSCEV(
5127 Value *Ptr,
5128 LoopVectorizationLegality *Legal,
5129 PredicatedScalarEvolution &PSE,
5130 const Loop *TheLoop) {
5131
5132 auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr);
5133 if (!Gep)
5134 return nullptr;
5135
5136 // We are looking for a gep with all loop invariant indices except for one
5137 // which should be an induction variable.
5138 auto *SE = PSE.getSE();
5139 unsigned NumOperands = Gep->getNumOperands();
5140 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5141 Value *Opd = Gep->getOperand(i_nocapture: Idx);
5142 if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) &&
5143 !Legal->isInductionVariable(V: Opd))
5144 return nullptr;
5145 }
5146
5147 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5148 return PSE.getSCEV(V: Ptr);
5149}
5150
5151InstructionCost
5152LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5153 ElementCount VF) {
5154 assert(VF.isVector() &&
5155 "Scalarization cost of instruction implies vectorization.");
5156 if (VF.isScalable())
5157 return InstructionCost::getInvalid();
5158
5159 Type *ValTy = getLoadStoreType(I);
5160 auto *SE = PSE.getSE();
5161
5162 unsigned AS = getLoadStoreAddressSpace(I);
5163 Value *Ptr = getLoadStorePointerOperand(V: I);
5164 Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
5165 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5166 // that it is being called from this specific place.
5167
5168 // Figure out whether the access is strided and get the stride value
5169 // if it's known in compile time
5170 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5171
5172 // Get the cost of the scalar memory instruction and address computation.
5173 InstructionCost Cost =
5174 VF.getFixedValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);
5175
5176 // Don't pass *I here, since it is scalar but will actually be part of a
5177 // vectorized loop where the user of it is a vectorized instruction.
5178 const Align Alignment = getLoadStoreAlignment(I);
5179 Cost += VF.getFixedValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
5180 Src: ValTy->getScalarType(),
5181 Alignment, AddressSpace: AS, CostKind);
5182
5183 // Get the overhead of the extractelement and insertelement instructions
5184 // we might create due to scalarization.
5185 Cost += getScalarizationOverhead(I, VF);
5186
5187 // If we have a predicated load/store, it will need extra i1 extracts and
5188 // conditional branches, but may not be executed for each vector lane. Scale
5189 // the cost by the probability of executing the predicated block.
5190 if (isPredicatedInst(I)) {
5191 Cost /= getPredBlockCostDivisor(CostKind);
5192
5193 // Add the cost of an i1 extract and a branch
5194 auto *VecI1Ty =
5195 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
5196 Cost += TTI.getScalarizationOverhead(
5197 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5198 /*Insert=*/false, /*Extract=*/true, CostKind);
5199 Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
5200
5201 if (useEmulatedMaskMemRefHack(I, VF))
5202 // Artificially setting to a high enough value to practically disable
5203 // vectorization with such operations.
5204 Cost = 3000000;
5205 }
5206
5207 return Cost;
5208}
5209
5210InstructionCost
5211LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5212 ElementCount VF) {
5213 Type *ValTy = getLoadStoreType(I);
5214 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5215 Value *Ptr = getLoadStorePointerOperand(V: I);
5216 unsigned AS = getLoadStoreAddressSpace(I);
5217 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5218
5219 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5220 "Stride should be 1 or -1 for consecutive memory access");
5221 const Align Alignment = getLoadStoreAlignment(I);
5222 InstructionCost Cost = 0;
5223 if (Legal->isMaskRequired(I)) {
5224 Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5225 CostKind);
5226 } else {
5227 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5228 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5229 CostKind, OpdInfo: OpInfo, I);
5230 }
5231
5232 bool Reverse = ConsecutiveStride < 0;
5233 if (Reverse)
5234 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5235 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5236 return Cost;
5237}
5238
5239InstructionCost
5240LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5241 ElementCount VF) {
5242 assert(Legal->isUniformMemOp(*I, VF));
5243
5244 Type *ValTy = getLoadStoreType(I);
5245 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5246 const Align Alignment = getLoadStoreAlignment(I);
5247 unsigned AS = getLoadStoreAddressSpace(I);
5248 if (isa<LoadInst>(Val: I)) {
5249 return TTI.getAddressComputationCost(Ty: ValTy) +
5250 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5251 CostKind) +
5252 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5253 SrcTy: VectorTy, Mask: {}, CostKind);
5254 }
5255 StoreInst *SI = cast<StoreInst>(Val: I);
5256
5257 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5258 // TODO: We have existing tests that request the cost of extracting element
5259 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5260 // the actual generated code, which involves extracting the last element of
5261 // a scalable vector where the lane to extract is unknown at compile time.
5262 return TTI.getAddressComputationCost(Ty: ValTy) +
5263 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS,
5264 CostKind) +
5265 (IsLoopInvariantStoreValue
5266 ? 0
5267 : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy,
5268 CostKind, Index: VF.getKnownMinValue() - 1));
5269}
5270
5271InstructionCost
5272LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5273 ElementCount VF) {
5274 Type *ValTy = getLoadStoreType(I);
5275 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5276 const Align Alignment = getLoadStoreAlignment(I);
5277 const Value *Ptr = getLoadStorePointerOperand(V: I);
5278
5279 return TTI.getAddressComputationCost(Ty: VectorTy) +
5280 TTI.getGatherScatterOpCost(Opcode: I->getOpcode(), DataTy: VectorTy, Ptr,
5281 VariableMask: Legal->isMaskRequired(I), Alignment,
5282 CostKind, I);
5283}
5284
5285InstructionCost
5286LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5287 ElementCount VF) {
5288 const auto *Group = getInterleavedAccessGroup(Instr: I);
5289 assert(Group && "Fail to get an interleaved access group.");
5290
5291 Instruction *InsertPos = Group->getInsertPos();
5292 Type *ValTy = getLoadStoreType(I: InsertPos);
5293 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5294 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
5295
5296 unsigned InterleaveFactor = Group->getFactor();
5297 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
5298
5299 // Holds the indices of existing members in the interleaved group.
5300 SmallVector<unsigned, 4> Indices;
5301 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5302 if (Group->getMember(Index: IF))
5303 Indices.push_back(Elt: IF);
5304
5305 // Calculate the cost of the whole interleaved group.
5306 bool UseMaskForGaps =
5307 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5308 (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
5309 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5310 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
5311 Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
5312 UseMaskForGaps);
5313
5314 if (Group->isReverse()) {
5315 // TODO: Add support for reversed masked interleaved access.
5316 assert(!Legal->isMaskRequired(I) &&
5317 "Reverse masked interleaved access not supported.");
5318 Cost += Group->getNumMembers() *
5319 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5320 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5321 }
5322 return Cost;
5323}
5324
/// Try to cost an entire in-loop reduction pattern (e.g.
/// reduce.add(mul(ext(A), ext(B)))) as one target operation. Returns the
/// combined cost for the pattern's root instruction and 0 for the other
/// pattern members, or std::nullopt when the normal per-instruction costing
/// should be used instead.
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                    ElementCount VF,
                                                    Type *Ty) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Val: Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the orignal cost method
  // should be used.
  Instruction *RetI = I;
  // If I is an extend with a single user, the candidate root is that user.
  if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  // Likewise step over a single-use mul into the add that consumes it.
  if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
  if (!LastChain)
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(Val: ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));

  // BaseCost is the plain reduce(A) cost against which patterns compete.
  InstructionCost BaseCost;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
    Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
    BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
                                          FMF: RdxDesc.getFastMathFlags(), CostKind);
  } else {
    BaseCost = TTI.getArithmeticReductionCost(
        Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
  }

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RK == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
                           ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
                           : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));

  VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(V: RedOp,
            P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
      match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
      !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Val: Op0);
    auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
    auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);

    // Prefer the fused cost only when it beats the sum of the components.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
             !TheLoop->isLoopInvariant(V: RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
    auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        FMF: RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
    if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Val: Op0);
      Type *Op0Ty = Op0->getOperand(i: 0)->getType();
      Type *Op1Ty = Op1->getOperand(i: 0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
            Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
            CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No pattern beat the components: the root gets the base reduction cost,
  // other members fall back to the normal costing.
  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
5506
5507InstructionCost
5508LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5509 ElementCount VF) {
5510 // Calculate scalar cost only. Vectorization cost should be ready at this
5511 // moment.
5512 if (VF.isScalar()) {
5513 Type *ValTy = getLoadStoreType(I);
5514 const Align Alignment = getLoadStoreAlignment(I);
5515 unsigned AS = getLoadStoreAddressSpace(I);
5516
5517 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5518 return TTI.getAddressComputationCost(Ty: ValTy) +
5519 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5520 OpdInfo: OpInfo, I);
5521 }
5522 return getWideningCost(I, VF);
5523}
5524
5525InstructionCost
5526LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5527 ElementCount VF) const {
5528
5529 // There is no mechanism yet to create a scalable scalarization loop,
5530 // so this is currently Invalid.
5531 if (VF.isScalable())
5532 return InstructionCost::getInvalid();
5533
5534 if (VF.isScalar())
5535 return 0;
5536
5537 InstructionCost Cost = 0;
5538 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5539 if (!RetTy->isVoidTy() &&
5540 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5541
5542 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
5543 Cost += TTI.getScalarizationOverhead(
5544 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5545 /*Insert=*/true,
5546 /*Extract=*/false, CostKind);
5547 }
5548 }
5549
5550 // Some targets keep addresses scalar.
5551 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5552 return Cost;
5553
5554 // Some targets support efficient element stores.
5555 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5556 return Cost;
5557
5558 // Collect operands to consider.
5559 CallInst *CI = dyn_cast<CallInst>(Val: I);
5560 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5561
5562 // Skip operands that do not require extraction/scalarization and do not incur
5563 // any overhead.
5564 SmallVector<Type *> Tys;
5565 for (auto *V : filterExtractingOperands(Ops, VF))
5566 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
5567 return Cost + TTI.getOperandsScalarizationOverhead(
5568 Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
5569}
5570
/// For every memory instruction in the loop, compare the TTI costs of the
/// legal widening strategies (widen / widen-reverse, interleave,
/// gather-scatter, scalarize) at the given VF and record the cheapest
/// decision. Afterwards, unless the target prefers vectorized addressing,
/// force instructions feeding scalar addresses to stay scalar.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // Decisions are only recorded for vector VFs.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only loads/stores (instructions with a pointer operand) are decided.
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't neccessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
            isLegalGatherOrScatter(V: &I, VF)
                ? getGatherScatterCost(I: &I, VF)
                : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
                                 : InstructionCost::getInvalid();

        // Choose better solution for the current VF, Note that Invalid
        // costs compare as maximumal large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      // Per-member alternatives are scaled by the group size so they compare
      // fairly against the whole-group interleave cost.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(Instr: &I))
        setWideningDecision(Grp: Group, VF, W: Decision, Cost);
      else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // (Same-block, non-phi operands are pulled in transitively.)
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
      else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(Index: I))
            setWideningDecision(
                I: Member, VF, W: CM_Scalarize,
                Cost: (VF.getKnownMinValue() *
                      getMemoryInstructionCost(I: Member,
                                               VF: ElementCount::getFixed(MinVal: 1))));
        }
      }
    } else {
      // Cannot scalarize fixed-order recurrence phis at the moment.
      if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
        continue;

      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
    }
  }
}
5762
/// For every call in the loop, compare the costs of scalarizing it, calling a
/// vector library variant (from VFDatabase), or using a vector intrinsic, and
/// record the cheapest decision for the given (vector) VF.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(Val: VF);
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      if (!CI)
        continue;

      // Invalid costs lose every comparison below unless explicitly computed.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);

      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(Ptr: CI)) ||
                            isUniformAfterVectorization(I: CI, VF))) {
        setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
                                IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
                                Cost: ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(I: CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));

            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));

            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
          case VFParamKind::GlobalPredicate:
            break;
          default:
            // Any other parameter kind is not supported here.
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Pick the cheapest option; ties favor the vector call over scalarizing
      // and the intrinsic over both.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
5921
5922bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
5923 if (!Legal->isInvariant(V: Op))
5924 return false;
5925 // Consider Op invariant, if it or its operands aren't predicated
5926 // instruction in the loop. In that case, it is not trivially hoistable.
5927 auto *OpI = dyn_cast<Instruction>(Val: Op);
5928 return !OpI || !TheLoop->contains(Inst: OpI) ||
5929 (!isPredicatedInst(I: OpI) &&
5930 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5931 all_of(Range: OpI->operands(),
5932 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5933}
5934
// Compute the expected cost of vectorizing instruction \p I with
// vectorization factor \p VF. Relies on the per-VF widening decisions,
// scalarization costs and minimal-bitwidth information collected earlier;
// returns an invalid cost when the instruction cannot be vectorized at \p VF.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  // Their cost is the scalar cost replicated once per (known-min) lane.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
             VF.getKnownMinValue();
  }

  // If the instruction was proven to need fewer bits, cost it at the
  // narrowed type instead of its declared type.
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    // Assertion helper: true if exactly one scalar copy of I remains (i.e.
    // neither I nor any of its users is scheduled for scalarization at VF).
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(Val: VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(Val: I) &&
             llvm::all_of(Range: I->users(), P: [&](User *U) {
               auto *UI = cast<Instruction>(Val: U);
               return !Scalarized->second.count(Val: UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);

  // Bail out when the target cannot legalize the vector type into any
  // number of native registers.
  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(Tp: VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(Val: I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (
          TTI.getScalarizationOverhead(
              Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
    // A vectorized switch is costed as one vector compare per case value.
    auto *Switch = cast<SwitchInst>(Val: I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Opcode: Instruction::ICmp,
               ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
               CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
               VecPred: CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Cost a splice shuffle combining the last lane of the previous
      // iteration's vector with the current one.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                DstTy: cast<VectorType>(Val: VectorTy),
                                SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(Val: U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(Key: HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Kind: Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(C&: Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
                 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
          {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Predicated div/rem is either scalarized or executed with a safe
    // divisor; pick whichever alternative is cheaper.
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(C&: I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
          PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
         (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
          PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
        PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
        isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
      Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    // Invariant (hoistable) operands behave like uniform values for costing.
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
          CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
                                  Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(Val: I->getOperand(i: 0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
    }

    // Compares are costed on the (possibly narrowed) operand type, not RetTy.
    VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(
        Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
        VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free; they disappear during codegen.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    // Calls use the widening decisions recorded earlier (vector variant,
    // intrinsic, or scalarization).
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    // Delegate to the generic TTI user-cost query.
    return TTI.getInstructionCost(U: I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
6372
// Populate ValuesToIgnore / VecValuesToIgnore with instructions the cost model
// should skip: ephemeral values, stores to invariant reduction addresses
// (these sink out of the loop), pointer operands of non-insert-position
// interleave-group members, and instructions that become trivially dead after
// vectorization. ValuesToIgnore holds values dead in both scalar and vector
// versions; VecValuesToIgnore holds values dead only in the vector version.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);

  // Worklists; both grow while being iterated below, which transitively
  // processes operands of newly discovered dead values.
  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
  };

  // Walk blocks in reverse RPO and instructions bottom-up so users are seen
  // before their operands.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
  for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
    for (Instruction &I : reverse(C&: *BB)) {
      // Find all stores to invariant variables. Since they are going to sink
      // outside the loop we do not need calculate cost for them.
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
          Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
        ValuesToIgnore.insert(Ptr: &I);
        DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
            Elt: SI->getValueOperand());
      }

      if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps to seed worklist.
      if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
          all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(Ptr: U) ||
                   ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(Elt: &I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(Instr: &I)) {
        auto *Group = getInterleavedAccessGroup(Instr: &I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(V: &I);
        DeadInterleavePointerOps.push_back(Elt: PointerOp);
      }

      // Queue branches for analysis. They are dead, if their successors only
      // contain dead instructions.
      if (auto *Br = dyn_cast<BranchInst>(Val: &I)) {
        if (Br->isConditional())
          DeadOps.push_back(Elt: &I);
      }
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations. Note: index-based loop because the worklist
  // grows during iteration.
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
          Instruction *UI = cast<Instruction>(Val: U);
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 (!isAccessInterleaved(Instr: UI) ||
                  getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Ptr: Op);
    DeadInterleavePointerOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
  }

  // All but the final stored value of each invariant store chain are dead.
  for (const auto &[_, Ops] : DeadInvariantStoreOps)
    llvm::append_range(C&: DeadOps, R: drop_end(RangeOrContainer: Ops));

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(Range&: *BB, P: [this](Instruction &I) {
      return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
             (isa<BranchInst>(Val: &I) && !cast<BranchInst>(Val: &I)->isConditional());
    });
  };
  // Index-based loop: DeadOps grows while being processed.
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<BranchInst>(Val: Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(i: 0);
      BasicBlock *ElseBB = Br->getSuccessor(i: 1);
      // Don't considers branches leaving the loop for simplification.
      if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      // The branch is dead when both successors are empty, or when one is
      // empty and falls through to the other without phis.
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Ptr: Br);
        DeadOps.push_back(Elt: Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Inst: Op) ||
        (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
        any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Range: Op->users(),
               P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
      ValuesToIgnore.insert(Ptr: Op);

    VecValuesToIgnore.insert(Ptr: Op);
    DeadOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
}
6525
6526void LoopVectorizationCostModel::collectInLoopReductions() {
6527 // Avoid duplicating work finding in-loop reductions.
6528 if (!InLoopReductions.empty())
6529 return;
6530
6531 for (const auto &Reduction : Legal->getReductionVars()) {
6532 PHINode *Phi = Reduction.first;
6533 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6534
6535 // We don't collect reductions that are type promoted (yet).
6536 if (RdxDesc.getRecurrenceType() != Phi->getType())
6537 continue;
6538
6539 // If the target would prefer this reduction to happen "in-loop", then we
6540 // want to record it as such.
6541 RecurKind Kind = RdxDesc.getRecurrenceKind();
6542 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6543 !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
6544 continue;
6545
6546 // Check that we can correctly put the reductions into the loop, by
6547 // finding the chain of operations that leads from the phi to the loop
6548 // exit value.
6549 SmallVector<Instruction *, 4> ReductionOperations =
6550 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
6551 bool InLoop = !ReductionOperations.empty();
6552
6553 if (InLoop) {
6554 InLoopReductions.insert(Ptr: Phi);
6555 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6556 Instruction *LastChain = Phi;
6557 for (auto *I : ReductionOperations) {
6558 InLoopReductionImmediateChains[I] = LastChain;
6559 LastChain = I;
6560 }
6561 }
6562 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6563 << " reduction for phi: " << *Phi << "\n");
6564 }
6565}
6566
6567// This function will select a scalable VF if the target supports scalable
6568// vectors and a fixed one otherwise.
6569// TODO: we could return a pair of values that specify the max VF and
6570// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6571// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6572// doesn't have a cost model that can choose which plan to execute if
6573// more than one is generated.
6574static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6575 LoopVectorizationCostModel &CM) {
6576 unsigned WidestType;
6577 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6578
6579 TargetTransformInfo::RegisterKind RegKind =
6580 TTI.enableScalableVectorization()
6581 ? TargetTransformInfo::RGK_ScalableVector
6582 : TargetTransformInfo::RGK_FixedWidthVector;
6583
6584 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6585 unsigned N = RegSize.getKnownMinValue() / WidestType;
6586 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6587}
6588
// Plan vectorization in the VPlan-native path. Only outer (non-innermost)
// loops are handled: a VF is chosen (user-provided or computed), VPlans are
// built, and the chosen factor is returned. Inner loops, unsupported scalable
// requests, and stress-test runs return VectorizationFactor::Disabled().
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honoured; report and give up.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target",
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(MinVF: VF, MaxVF: VF);

    // Construction produced no plans; nothing can be vectorized.
    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6643
/// Build VPlans and collect cost-model decisions for all candidate
/// vectorization factors of the innermost loop. A valid user-provided VF
/// short-circuits candidate enumeration; otherwise all powers of two up to the
/// computed fixed and scalable maxima are considered.
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user VF is only validated against the maximum factor of the matching
  // kind (fixed vs. scalable).
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
      reportVectorizationInfo(
          Msg: "UserVF ignored because it may be larger than the maximal safe VF",
          ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      // The user VF had invalid costs; fall through to automatic selection.
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
                              ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}
6714
6715InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6716 ElementCount VF) const {
6717 if (ForceTargetInstructionCost.getNumOccurrences())
6718 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
6719 return CM.getInstructionCost(I: UI, VF);
6720}
6721
6722bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6723 ElementCount VF) const {
6724 return CM.isUniformAfterVectorization(I, VF);
6725}
6726
6727bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6728 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6729 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6730 SkipCostComputation.contains(Ptr: UI);
6731}
6732
/// Pre-compute, using the legacy cost model, the costs of instructions whose
/// VPlan-based costing would diverge from the legacy model (induction chains,
/// exit conditions, non-latch branches, forced/profitable scalars). Each
/// costed instruction is added to \p CostCtx.SkipCostComputation so the
/// VPlan-based pass does not count it again. Returns the accumulated cost.
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
    // Worklist walk: IVInsts grows while iterating, transitively collecting
    // the single-use, in-loop operand chain feeding the increment.
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Val: Op);
        if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(Elt: OpI);
      }
    }
    IVInsts.push_back(Elt: IV);
    // Also include truncates of the IV that the cost model can optimize away
    // (they are absorbed into the widened induction).
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(Val: U);
      if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
        continue;
      IVInsts.push_back(Elt: CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
                                           InstsToIgnore&: CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      // Skip anything already ignored or already pre-computed.
      if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(Ptr: IVInst);
    }
  }

  /// Compute the cost of all exiting conditions of the loop using the legacy
  /// cost model. This is to match the legacy behavior, which adds the cost of
  /// all exit conditions. Note that this over-estimates the cost, as there will
  /// be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
      ExitInstrs.insert(X: CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  // ExitInstrs grows while iterating, walking operand chains transitively.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(Inst: CondI) ||
        !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Val: Op);
      // Only pull in operands whose in-loop users are all exit-condition
      // instructions; anything else is costed as part of the loop body.
      if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
          any_of(Range: OpI->users(), P: [&ExitInstrs, this](User *U) {
            return OrigLoop->contains(BB: cast<Instruction>(Val: U)->getParent()) &&
                   !ExitInstrs.contains(key: cast<Instruction>(Val: U));
          }))
        continue;
      ExitInstrs.insert(X: OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
    // The latch terminator (backedge) is marked skipped but contributes no
    // cost here.
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  // Scalarization costs were already computed and cached by the legacy cost
  // model; just account for them and mark the instructions as handled.
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
6874
/// Return the total cost of \p Plan at factor \p VF: legacy pre-computed
/// costs (which also populate the skip set) plus the VPlan-based cost of the
/// remaining recipes.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  // Legacy-model costs for instructions the VPlan-based model would mis-cost;
  // fills CostCtx.SkipCostComputation so they are not counted twice below.
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, Ctx&: CostCtx);
#ifndef NDEBUG
  // Debug-only reporting of the estimated per-lane cost.
  unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
6896
6897#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // An interleave recipe covers every member of its interleave group, so
      // record all members as seen.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost it whilst the legacy will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(), [](VPUser *U) {
              auto *VPI = dyn_cast<VPInstruction>(U);
              return VPI && VPI->getOpcode() ==
                                VPInstruction::FirstOrderRecurrenceSplice;
            }))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reduction and
      // comparing against the legacy cost isn't desirable.
      if (isa<VPPartialReductionRecipe>(&R))
        return true;

      /// If a VPlan transform folded a recipe to one producing a single-scalar,
      /// but the original instruction wasn't uniform-after-vectorization in the
      /// legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
          if ((WidenCmp->getOpcode() == Instruction::ICmp ||
               WidenCmp->getOpcode() == Instruction::FCmp) &&
              WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
            return true;
        }
        SeenInstrs.insert(UI);
      }
    }
  }

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may not
      // be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
6981#endif
6982
/// Select the most profitable vectorization factor among all built VPlans
/// using the VPlan-based cost model. In assert builds the decision is
/// cross-checked against the legacy cost model.
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                            : CM.CostKind == TTI::TCK_Latency
                                  ? "Instruction Latency\n"
                                  : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                                  : CM.CostKind == TTI::TCK_SizeAndLatency
                                      ? "Code Size and Latency\n"
                                      : "Unknown\n"));

  // A scalar plan always exists and provides the baseline the vector factors
  // compete against.
  ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is only needed when max-bandwidth may select a VF that
    // spills; compute it once per plan, for all of its VFs.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_ScalableVector) ||
        CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_FixedWidthVector))
      RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The scalar VF is covered by ScalarFactor above.
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost = cost(Plan&: *P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      // Reject VFs whose register pressure exceeds the target's budget when
      // max-bandwidth selection is active.
      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                          << VF << " because it uses too many registers\n");
        continue;
      }

      if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
        ProfitableVFs.push_back(Elt: CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for VPlans
  // with early exits and plans with additional VPlan simplifications. The
  // legacy cost model doesn't properly model costs for such loops.
  assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
          planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
                                                CostCtx, OrigLoop,
                                                BestFactor.Width) ||
          planContainsAdditionalSimplifications(
              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
         " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7098
7099static void addRuntimeUnrollDisableMetaData(Loop *L) {
7100 SmallVector<Metadata *, 4> MDs;
7101 // Reserve first location for self reference to the LoopID metadata node.
7102 MDs.push_back(Elt: nullptr);
7103 bool IsUnrollMetadata = false;
7104 MDNode *LoopID = L->getLoopID();
7105 if (LoopID) {
7106 // First find existing loop unrolling disable metadata.
7107 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7108 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
7109 if (MD) {
7110 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
7111 IsUnrollMetadata =
7112 S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
7113 }
7114 MDs.push_back(Elt: LoopID->getOperand(I));
7115 }
7116 }
7117
7118 if (!IsUnrollMetadata) {
7119 // Add runtime unroll disable metadata.
7120 LLVMContext &Context = L->getHeader()->getContext();
7121 SmallVector<Metadata *, 1> DisableOperands;
7122 DisableOperands.push_back(
7123 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
7124 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
7125 MDs.push_back(Elt: DisableNode);
7126 MDNode *NewLoopID = MDNode::get(Context, MDs);
7127 // Set operand 0 to refer to the loop id itself.
7128 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
7129 L->setLoopID(NewLoopID);
7130 }
7131}
7132
/// Return the live-in IR start value of the FindIV reduction computed by
/// \p RdxResult, looking through a wrapping Freeze if one is present.
static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
  using namespace VPlanPatternMatch;
  assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
         "RdxResult must be ComputeFindIVResult");
  VPValue *StartVPV = RdxResult->getOperand(N: 1);
  // On a successful match, StartVPV is rebound to the Freeze's operand;
  // otherwise it is left unchanged.
  match(V: StartVPV, P: m_Freeze(Op0: m_VPValue(V&: StartVPV)));
  return StartVPV->getLiveInIRValue();
}
7141
// If \p EpiResumePhiR is a resume VPPhi for a reduction when vectorizing the
// epilog loop, fix the reduction's scalar PHI node by adding the incoming
// value from the main vector loop (reaching via \p BypassBlock).
static void fixReductionScalarResumeWhenVectorizingEpilog(
    VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
  // Get the VPInstruction computing the reduction result in the middle block.
  // The first operand may not be from the middle block if it is not connected
  // to the scalar preheader. In that case, there's nothing to fix.
  VPValue *Incoming = EpiResumePhiR->getOperand(N: 0);
  // Look through a possible zext/sext wrapping the reduction result.
  match(V: Incoming, P: VPlanPatternMatch::m_ZExtOrSExt(
                       Op0: VPlanPatternMatch::m_VPValue(V&: Incoming)));
  auto *EpiRedResult = dyn_cast<VPInstruction>(Val: Incoming);
  if (!EpiRedResult ||
      (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
       EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
    return;

  auto *EpiRedHeaderPhi =
      cast<VPReductionPHIRecipe>(Val: EpiRedResult->getOperand(N: 0));
  RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
  // Recover the resume value produced by the main vector loop; the start value
  // may be wrapped in a Broadcast/ReductionStartVector recipe.
  Value *MainResumeValue;
  if (auto *VPI = dyn_cast<VPInstruction>(Val: EpiRedHeaderPhi->getStartValue())) {
    assert((VPI->getOpcode() == VPInstruction::Broadcast ||
            VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
           "unexpected start recipe");
    MainResumeValue = VPI->getOperand(N: 0)->getUnderlyingValue();
  } else
    MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) {
    // AnyOf reductions start from an ICMP_NE against the original start value;
    // peel the compare off to get the underlying resume phi.
    [[maybe_unused]] Value *StartV =
        EpiRedResult->getOperand(N: 1)->getLiveInIRValue();
    auto *Cmp = cast<ICmpInst>(Val: MainResumeValue);
    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
           "AnyOf expected to start with ICMP_NE");
    assert(Cmp->getOperand(1) == StartV &&
           "AnyOf expected to start by comparing main resume value to original "
           "start value");
    MainResumeValue = Cmp->getOperand(i_nocapture: 0);
  } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
    // FindIV reductions resume through a select between the sentinel and the
    // original resume value; unwrap the select to recover the resume phi.
    Value *StartV = getStartValueFromReductionResult(RdxResult: EpiRedResult);
    Value *SentinelV = EpiRedResult->getOperand(N: 2)->getLiveInIRValue();
    using namespace llvm::PatternMatch;
    Value *Cmp, *OrigResumeV, *CmpOp;
    [[maybe_unused]] bool IsExpectedPattern =
        match(V: MainResumeValue,
              P: m_Select(C: m_OneUse(SubPattern: m_Value(V&: Cmp)), L: m_Specific(V: SentinelV),
                        R: m_Value(V&: OrigResumeV))) &&
        (match(V: Cmp, P: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: OrigResumeV),
                                    R: m_Value(V&: CmpOp))) &&
         ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(V: CmpOp))));
    assert(IsExpectedPattern && "Unexpected reduction resume pattern");
    MainResumeValue = OrigResumeV;
  }
  PHINode *MainResumePhi = cast<PHINode>(Val: MainResumeValue);

  // When fixing reductions in the epilogue loop we should already have
  // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
  // over the incoming values correctly.
  auto *EpiResumePhi = cast<PHINode>(Val: State.get(Def: EpiResumePhiR, IsScalar: true));
  EpiResumePhi->setIncomingValueForBlock(
      BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
}
7205
7206DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7207 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7208 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7209 assert(BestVPlan.hasVF(BestVF) &&
7210 "Trying to execute plan with unsupported VF");
7211 assert(BestVPlan.hasUF(BestUF) &&
7212 "Trying to execute plan with unsupported UF");
7213 if (BestVPlan.hasEarlyExit())
7214 ++LoopsEarlyExitVectorized;
7215 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7216 // cost model is complete for better cost estimates.
7217 VPlanTransforms::runPass(Fn: VPlanTransforms::unrollByUF, Plan&: BestVPlan, Args&: BestUF,
7218 Args&: OrigLoop->getHeader()->getContext());
7219 VPlanTransforms::runPass(Fn: VPlanTransforms::replicateByVF, Plan&: BestVPlan, Args&: BestVF);
7220 VPlanTransforms::runPass(Fn: VPlanTransforms::materializeBroadcasts, Plan&: BestVPlan);
7221 bool HasBranchWeights =
7222 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
7223 if (HasBranchWeights) {
7224 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7225 VPlanTransforms::runPass(Fn: VPlanTransforms::addBranchWeightToMiddleTerminator,
7226 Plan&: BestVPlan, Args&: BestVF, Args&: VScale);
7227 }
7228
7229 if (!VectorizingEpilogue) {
7230 // Checks are the same for all VPlans, added to BestVPlan only for
7231 // compactness.
7232 attachRuntimeChecks(Plan&: BestVPlan, RTChecks&: ILV.RTChecks, HasBranchWeights);
7233 }
7234
7235 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7236 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());
7237 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7238 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan, CanonicalIVTy&: *Legal->getWidestInductionType());
7239 VPlanTransforms::narrowInterleaveGroups(
7240 Plan&: BestVPlan, VF: BestVF,
7241 VectorRegWidth: TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector));
7242 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7243
7244 VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan,
7245 CanonicalIVTy&: *Legal->getWidestInductionType());
7246 // Regions are dissolved after optimizing for VF and UF, which completely
7247 // removes unneeded loop regions first.
7248 VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
7249 // Perform the actual loop transformation.
7250 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7251 OrigLoop->getParentLoop(),
7252 Legal->getWidestInductionType());
7253
7254#ifdef EXPENSIVE_CHECKS
7255 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7256#endif
7257
7258 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7259 // making any changes to the CFG.
7260 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
7261 auto *Entry = cast<VPIRBasicBlock>(Val: BestVPlan.getEntry());
7262 State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator());
7263 for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
7264 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
7265 if (!ExpSCEV)
7266 continue;
7267 ExpSCEV->execute(State);
7268 ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(Def: ExpSCEV, Lane: VPLane(0));
7269 VPValue *Exp = BestVPlan.getOrAddLiveIn(V: ExpandedSCEVs[ExpSCEV->getSCEV()]);
7270 ExpSCEV->replaceAllUsesWith(New: Exp);
7271 if (BestVPlan.getTripCount() == ExpSCEV)
7272 BestVPlan.resetTripCount(NewTripCount: Exp);
7273 ExpSCEV->eraseFromParent();
7274 }
7275
7276 if (!ILV.getTripCount())
7277 ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Lane: VPLane(0)));
7278 else
7279 assert(VectorizingEpilogue && "should only re-use the existing trip "
7280 "count during epilogue vectorization");
7281
7282 // 1. Set up the skeleton for vectorization, including vector pre-header and
7283 // middle block. The vector loop is created during VPlan execution.
7284 BasicBlock *EntryBB =
7285 cast<VPIRBasicBlock>(Val: BestVPlan.getEntry())->getIRBasicBlock();
7286 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7287 if (VectorizingEpilogue)
7288 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7289
7290 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7291 "final VPlan is invalid");
7292
7293 ILV.printDebugTracesAtStart();
7294
7295 //===------------------------------------------------===//
7296 //
7297 // Notice: any optimization or new instruction that go
7298 // into the code below should also be implemented in
7299 // the cost-model.
7300 //
7301 //===------------------------------------------------===//
7302
7303 // 2. Copy and widen instructions from the old loop into the new loop.
7304 BestVPlan.prepareToExecute(
7305 TripCount: ILV.getTripCount(),
7306 VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: ILV.LoopVectorPreHeader), State);
7307 replaceVPBBWithIRVPBB(VPBB: VectorPH, IRBB: State.CFG.PrevBB);
7308
7309 // Move check blocks to their final position.
7310 // TODO: Move as part of VPIRBB execute and update impacted tests.
7311 if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
7312 MemCheckBlock->moveAfter(MovePos: EntryBB);
7313 if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
7314 SCEVCheckBlock->moveAfter(MovePos: EntryBB);
7315
7316 BestVPlan.execute(State: &State);
7317
7318 // 2.5 When vectorizing the epilogue, fix reduction resume values from the
7319 // additional bypass block.
7320 if (VectorizingEpilogue) {
7321 assert(!BestVPlan.hasEarlyExit() &&
7322 "Epilogue vectorisation not yet supported with early exits");
7323 BasicBlock *PH = OrigLoop->getLoopPreheader();
7324 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7325 for (auto *Pred : predecessors(BB: PH)) {
7326 for (PHINode &Phi : PH->phis()) {
7327 if (Phi.getBasicBlockIndex(BB: Pred) != -1)
7328 continue;
7329 Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
7330 }
7331 }
7332 VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
7333 if (ScalarPH->getNumPredecessors() > 0) {
7334 // If ScalarPH has predecessors, we may need to update its reduction
7335 // resume values.
7336 for (VPRecipeBase &R : ScalarPH->phis()) {
7337 fixReductionScalarResumeWhenVectorizingEpilog(EpiResumePhiR: cast<VPPhi>(Val: &R), State,
7338 BypassBlock);
7339 }
7340 }
7341 }
7342
7343 // 2.6. Maintain Loop Hints
7344 // Keep all loop hints from the original loop on the vector loop (we'll
7345 // replace the vectorizer-specific hints below).
7346 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7347 if (HeaderVPBB) {
7348 MDNode *OrigLoopID = OrigLoop->getLoopID();
7349
7350 std::optional<MDNode *> VectorizedLoopID =
7351 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
7352 LLVMLoopVectorizeFollowupVectorized});
7353
7354 Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]);
7355 if (VectorizedLoopID) {
7356 L->setLoopID(*VectorizedLoopID);
7357 } else {
7358 // Keep all loop hints from the original loop on the vector loop (we'll
7359 // replace the vectorizer-specific hints below).
7360 if (MDNode *LID = OrigLoop->getLoopID())
7361 L->setLoopID(LID);
7362
7363 LoopVectorizeHints Hints(L, true, *ORE);
7364 Hints.setAlreadyVectorized();
7365
7366 // Check if it's EVL-vectorized and mark the corresponding metadata.
7367 bool IsEVLVectorized =
7368 llvm::any_of(Range&: *HeaderVPBB, P: [](const VPRecipeBase &Recipe) {
7369 // Looking for the ExplictVectorLength VPInstruction.
7370 if (const auto *VI = dyn_cast<VPInstruction>(Val: &Recipe))
7371 return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
7372 return false;
7373 });
7374 if (IsEVLVectorized) {
7375 LLVMContext &Context = L->getHeader()->getContext();
7376 MDNode *LoopID = L->getLoopID();
7377 auto *IsEVLVectorizedMD = MDNode::get(
7378 Context,
7379 MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized.tailfoldingstyle"),
7380 MDString::get(Context, Str: "evl")});
7381 MDNode *NewLoopID = makePostTransformationMetadata(Context, OrigLoopID: LoopID, RemovePrefixes: {},
7382 AddAttrs: {IsEVLVectorizedMD});
7383 L->setLoopID(NewLoopID);
7384 }
7385 }
7386 TargetTransformInfo::UnrollingPreferences UP;
7387 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7388 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7389 addRuntimeUnrollDisableMetaData(L);
7390 }
7391
7392 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7393 // predication, updating analyses.
7394 ILV.fixVectorizedLoop(State);
7395
7396 ILV.printDebugTracesAtEnd();
7397
7398 return ExpandedSCEVs;
7399}
7400
7401//===--------------------------------------------------------------------===//
7402// EpilogueVectorizerMainLoop
7403//===--------------------------------------------------------------------===//
7404
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// It creates the skeleton for the *main* vector loop of an epilogue
/// vectorization, emitting both iteration-count checks and saving state in
/// EPI for the second (epilogue) pass. Returns the vector loop preheader.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  createVectorLoopSkeleton(Prefix: "");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);

  // Generate the induction variable.
  EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);

  // Hook the scalar preheader into the VPlan by replacing the placeholder
  // scalar-preheader VPBB with an IR-backed block.
  replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
  return LoopVectorPreHeader;
}
7431
7432void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7433 LLVM_DEBUG({
7434 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7435 << "Main Loop VF:" << EPI.MainLoopVF
7436 << ", Main Loop UF:" << EPI.MainLoopUF
7437 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7438 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7439 });
7440}
7441
7442void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7443 DEBUG_WITH_TYPE(VerboseDebug, {
7444 dbgs() << "intermediate fn:\n"
7445 << *OrigLoop->getHeader()->getParent() << "\n";
7446 });
7447}
7448
/// Emit a minimum-iteration-count check that branches to \p Bypass when too
/// few iterations remain; otherwise control falls through to a newly split
/// vector preheader. When \p ForEpilogue is set, the epilogue loop's VF/UF
/// are checked (and the trip count is saved in EPI for the second pass);
/// otherwise the main loop's VF/UF are used. Returns the block containing
/// the check.
BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                    bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  Value *Count = getTripCount();
  // NOTE(review): the minimum profitable trip count is cleared here so that
  // only the VF * UF minimum-iterations check below applies — confirm intent.
  MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
  Value *CheckMinIters = createIterationCountCheck(
      VF: ForEpilogue ? EPI.EpilogueVF : VF, UF: ForEpilogue ? EPI.EpilogueUF : UF);

  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                                   DT: static_cast<DominatorTree *>(nullptr), LI,
                                   MSSAU: nullptr, BBName: "vector.ph");

  if (ForEpilogue) {
    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the split block's unconditional terminator with the conditional
  // bypass branch.
  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  // When vectorizing the main loop, its trip-count check is placed in a new
  // block, whereas the overall trip-count check is placed in the VPlan entry
  // block. When vectorizing the epilogue loop, its trip-count check is placed
  // in the VPlan entry block.
  if (!ForEpilogue)
    introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
  return TCCheckBlock;
}
7488
7489//===--------------------------------------------------------------------===//
7490// EpilogueVectorizerEpilogueLoop
7491//===--------------------------------------------------------------------===//
7492
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass: creates the skeleton for the epilogue vector loop, re-wiring
/// the check blocks created by the first (main-loop) pass and migrating phis
/// out of the new vec.epilog.iter.check block. Returns the epilogue vector
/// loop preheader.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  createVectorLoopSkeleton(Prefix: "vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  LoopVectorPreHeader->setName("vec.epilog.ph");
  BasicBlock *VecEpilogueIterationCountCheck =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->begin(), DT, LI,
                 MSSAU: nullptr, BBName: "vec.epilog.iter.check", Before: true);
  emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
                                          Insert: VecEpilogueIterationCountCheck);
  // Record the check block: resume values for the scalar loop can flow in
  // through it when the epilogue's iteration-count check fails.
  AdditionalBypassBlock = VecEpilogueIterationCountCheck;

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Retarget edges from the main-loop check that point at the epilogue check
  // block so they go straight to the epilogue vector preheader instead.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);

  // Retarget edges from the first-pass epilogue check so they bypass directly
  // to the scalar preheader.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock)
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
  if (MemCheckBlock)
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  DT->changeImmediateDominator(BB: LoopScalarPreHeader,
                               NewBB: EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
  // reductions which merge control-flow from the latch block and the middle
  // block. Update the incoming values here and move the Phi into the preheader.
  // Snapshot the phi list first since the loop below moves phis out of the
  // block while iterating.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(BB: SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(BB: MemCheckBlock);
  }

  replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
  return LoopVectorPreHeader;
}
7562
/// Emit into \p Insert the compare-and-branch guarding the vectorized
/// epilogue: the iterations remaining after the main vector loop
/// (TripCount - VectorTripCount) are compared against the epilogue's
/// VF * UF, branching to \p Bypass when there are too few. Also re-hooks the
/// epilogue VPlan's entry to \p Insert. Returns \p Insert.
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  // When a scalar epilogue is required, even an exact multiple of the step
  // must take the bypass, hence ULE instead of ULT.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, LHS: Count,
                         RHS: createStepForVF(B&: Builder, Ty: Count->getType(),
                                             VF: EPI.EpilogueVF, Step: EPI.EpilogueUF),
                         Name: "min.epilog.iters.check");

  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
    // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
    // think the MainLoopStep is correct.
    unsigned MainLoopStep = UF * VF.getKnownMinValue();
    unsigned EpilogueLoopStep =
        EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
    // We assume the remaining `Count` is equally distributed in
    // [0, MainLoopStep)
    // So the probability for `Count < EpilogueLoopStep` should be
    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
    unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
    const uint32_t Weights[] = {EstimatedSkipCount,
                                MainLoopStep - EstimatedSkipCount};
    setBranchWeights(I&: BI, Weights, /*IsExpected=*/false);
  }
  ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI);

  // A new entry block has been created for the epilogue VPlan. Hook it in, as
  // otherwise we would try to modify the entry to the main vector loop.
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: Insert);
  VPBasicBlock *OldEntry = Plan.getEntry();
  VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return Insert;
}
7614
7615void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7616 LLVM_DEBUG({
7617 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7618 << "Epilogue Loop VF:" << EPI.EpilogueVF
7619 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7620 });
7621}
7622
7623void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7624 DEBUG_WITH_TYPE(VerboseDebug, {
7625 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7626 });
7627}
7628
/// Try to build a VPWidenLoadRecipe/VPWidenStoreRecipe for the load or store
/// \p I, clamping \p Range to the VFs for which the cost model decided to
/// widen (or interleave) the access. Returns nullptr if the access will not
/// be widened for any VF in the clamped range.
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  // True iff the cost model's decision for VF is to widen or interleave
  // (i.e. not scalarize) the access.
  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(VPBB: Builder.getInsertBlock());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive. The decision for Range.Start holds for the whole
  // clamped range per the predicate above.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // For loads the pointer is operand 0; for stores the value is operand 0 and
  // the pointer is operand 1.
  VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop and it may not be inbounds. Drop Inbounds in that
      // case.
      GEPNoWrapFlags Flags =
          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
              ? GEPNoWrapFlags::none()
              : GEPNoWrapFlags::inBounds();
      // Stride -1 models the reverse-consecutive access.
      VectorPtr =
          new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
                                       /*Stride*/ -1, Flags, I->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            I->getDebugLoc());
    }
    Builder.insert(R: VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());

  StoreInst *Store = cast<StoreInst>(Val: I);
  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
                                Reverse, VPIRMetadata(*Store, LVer),
                                I->getDebugLoc());
}
7697
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe. \p PhiOrTrunc
/// is either \p Phi itself or a truncation of it, in which case the trunc is
/// attached to the recipe.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
  // The start value must match the phi's incoming value from the preheader,
  // and the step must be loop-invariant so it can be expanded outside the
  // loop.
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");

  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
    // Pass the trunc to the recipe and use its debug location.
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                             IndDesc, TruncI,
                                             TruncI->getDebugLoc());
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                           IndDesc, Phi->getDebugLoc());
}
7720
/// Try to build a widened induction recipe for the header phi \p Phi if it is
/// an integer, floating-point or pointer induction; returns nullptr otherwise.
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
                                       SE&: *PSE.getSE(), OrigLoop&: *OrigLoop);

  // Check if this is pointer induction. If so, build the recipe for it.
  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
                                                           SE&: *PSE.getSE());
    // The boolean argument records whether the phi is scalar after
    // vectorization across the (clamped) VF range.
    return new VPWidenPointerInductionRecipe(
        Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
        LoopVectorizationPlanner::getDecisionAndClampRange(
            Predicate: [&](ElementCount VF) {
              return CM.isScalarAfterVectorization(I: Phi, VF);
            },
            Range),
        Phi->getDebugLoc());
  }
  return nullptr;
}
7745
/// Try to turn the truncation \p I of an induction phi into a widened
/// induction recipe that produces the truncated values directly; returns
/// nullptr when the cost model says the trunc is not optimizable for any VF
/// in \p Range.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto IsOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(I: K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: IsOptimizableIVTruncate(I), Range)) {

    // The trunc's operand is the induction phi; widen the induction using the
    // trunc's type via PhiOrTrunc = I.
    auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
    return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
                                       OrigLoop&: *OrigLoop);
  }
  return nullptr;
}
7773
/// Try to widen the call \p CI either as a vector intrinsic or as a call to a
/// vector library variant, based on the cost model's call-widening decision
/// for the VFs in \p Range. Returns nullptr if the call must be predicated,
/// is a special intrinsic handled elsewhere, or has no profitable vector
/// form.
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {
  // Calls that are scalar-with-predication are not widened here.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  // These intrinsics are handled specially rather than widened into vector
  // calls.
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Keep only the call's arguments; trailing operands beyond arg_size are
  // re-added for vector calls below.
  SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
                                      CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The block needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the block, but the only available
      //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
      VPValue *Mask = nullptr;
      if (Legal->isMaskRequired(I: CI))
        Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
      else
        Mask = Plan.getOrAddLiveIn(
            V: ConstantInt::getTrue(Ty: IntegerType::getInt1Ty(C&: CI->getContext())));

      // Insert the mask at the position the variant expects it.
      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    // Re-append the trailing operand (dropped by take_front above).
    Ops.push_back(Elt: Operands.back());
    return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
  }

  return nullptr;
}
7863
7864bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7865 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7866 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7867 // Instruction should be widened, unless it is scalar after vectorization,
7868 // scalarization is profitable or it is predicated.
7869 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7870 return CM.isScalarAfterVectorization(I, VF) ||
7871 CM.isProfitableToScalarize(I, VF) ||
7872 CM.isScalarWithPredication(I, VF);
7873 };
7874 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7875 Range);
7876}
7877
7878VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
7879 ArrayRef<VPValue *> Operands) {
7880 switch (I->getOpcode()) {
7881 default:
7882 return nullptr;
7883 case Instruction::SDiv:
7884 case Instruction::UDiv:
7885 case Instruction::SRem:
7886 case Instruction::URem: {
7887 // If not provably safe, use a select to form a safe divisor before widening the
7888 // div/rem operation itself. Otherwise fall through to general handling below.
7889 if (CM.isPredicatedInst(I)) {
7890 SmallVector<VPValue *> Ops(Operands);
7891 VPValue *Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7892 VPValue *One =
7893 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
7894 auto *SafeRHS = Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: I->getDebugLoc());
7895 Ops[1] = SafeRHS;
7896 return new VPWidenRecipe(*I, Ops);
7897 }
7898 [[fallthrough]];
7899 }
7900 case Instruction::Add:
7901 case Instruction::And:
7902 case Instruction::AShr:
7903 case Instruction::FAdd:
7904 case Instruction::FCmp:
7905 case Instruction::FDiv:
7906 case Instruction::FMul:
7907 case Instruction::FNeg:
7908 case Instruction::FRem:
7909 case Instruction::FSub:
7910 case Instruction::ICmp:
7911 case Instruction::LShr:
7912 case Instruction::Mul:
7913 case Instruction::Or:
7914 case Instruction::Select:
7915 case Instruction::Shl:
7916 case Instruction::Sub:
7917 case Instruction::Xor:
7918 case Instruction::Freeze: {
7919 SmallVector<VPValue *> NewOps(Operands);
7920 if (Instruction::isBinaryOp(Opcode: I->getOpcode())) {
7921 // The legacy cost model uses SCEV to check if some of the operands are
7922 // constants. To match the legacy cost model's behavior, use SCEV to try
7923 // to replace operands with constants.
7924 ScalarEvolution &SE = *PSE.getSE();
7925 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7926 if (!Op->isLiveIn())
7927 return Op;
7928 Value *V = Op->getUnderlyingValue();
7929 if (isa<Constant>(Val: V) || !SE.isSCEVable(Ty: V->getType()))
7930 return Op;
7931 auto *C = dyn_cast<SCEVConstant>(Val: SE.getSCEV(V));
7932 if (!C)
7933 return Op;
7934 return Plan.getOrAddLiveIn(V: C->getValue());
7935 };
7936 // For Mul, the legacy cost model checks both operands.
7937 if (I->getOpcode() == Instruction::Mul)
7938 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7939 // For other binops, the legacy cost model only checks the second operand.
7940 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7941 }
7942 return new VPWidenRecipe(*I, NewOps);
7943 }
7944 case Instruction::ExtractValue: {
7945 SmallVector<VPValue *> NewOps(Operands);
7946 Type *I32Ty = IntegerType::getInt32Ty(C&: I->getContext());
7947 auto *EVI = cast<ExtractValueInst>(Val: I);
7948 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7949 unsigned Idx = EVI->getIndices()[0];
7950 NewOps.push_back(Elt: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: Idx, IsSigned: false)));
7951 return new VPWidenRecipe(*I, NewOps);
7952 }
7953 };
7954}
7955
7956VPHistogramRecipe *
7957VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7958 ArrayRef<VPValue *> Operands) {
7959 // FIXME: Support other operations.
7960 unsigned Opcode = HI->Update->getOpcode();
7961 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7962 "Histogram update operation must be an Add or Sub");
7963
7964 SmallVector<VPValue *, 3> HGramOps;
7965 // Bucket address.
7966 HGramOps.push_back(Elt: Operands[1]);
7967 // Increment value.
7968 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7969
7970 // In case of predicated execution (due to tail-folding, or conditional
7971 // execution, or both), pass the relevant mask.
7972 if (Legal->isMaskRequired(I: HI->Store))
7973 HGramOps.push_back(Elt: getBlockInMask(VPBB: Builder.getInsertBlock()));
7974
7975 return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
7976}
7977
/// Build a VPReplicateRecipe for \p I, marking it uniform when the cost model
/// says so across the clamped VF range (with special-case handling for some
/// intrinsics under scalable VFs) and attaching the block-in mask when \p I
/// must be predicated.
VPReplicateRecipe *
VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
                                   VFRange &Range) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = getBlockInMask(VPBB: Builder.getInsertBlock());
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account for
  // as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
                                       VPIRMetadata(*I, LVer));
  return Recipe;
}
8040
/// Find all possible partial reductions in the loop and track all of those
/// that are valid so recipes can be formed later. Valid chains are recorded
/// in ScaledReductionMap keyed by their reduction instruction.
void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
  // Find all possible partial reductions.
  SmallVector<std::pair<PartialReductionChain, unsigned>>
      PartialReductionChains;
  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
    getScaledReductions(PHI: Phi, RdxExitInstr: RdxDesc.getLoopExitInstr(), Range,
                        Chains&: PartialReductionChains);
  }

  // A partial reduction is invalid if any of its extends are used by
  // something that isn't another partial reduction. This is because the
  // extends are intended to be lowered along with the reduction itself.

  // Build up a set of partial reduction ops for efficient use checking.
  SmallSet<User *, 4> PartialReductionOps;
  for (const auto &[PartialRdx, _] : PartialReductionChains)
    PartialReductionOps.insert(Ptr: PartialRdx.ExtendUser);

  auto ExtendIsOnlyUsedByPartialReductions =
      [&PartialReductionOps](Instruction *Extend) {
        return all_of(Range: Extend->users(), P: [&](const User *U) {
          return PartialReductionOps.contains(Ptr: U);
        });
      };

  // Check if each use of a chain's two extends is a partial reduction
  // and only add those that don't have non-partial reduction users.
  // ExtendB may be absent (single-extend chains), in which case only
  // ExtendA's users are checked.
  for (auto Pair : PartialReductionChains) {
    PartialReductionChain Chain = Pair.first;
    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
        (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
      ScaledReductionMap.try_emplace(Key: Chain.Reduction, Args&: Pair.second);
  }
}
8077
/// Try to recognize a partial-reduction chain rooted at \p RdxExitInstr, the
/// in-loop update of reduction phi \p PHI. The recognized patterns are
/// "PHI += binop(ext(A), ext(B))" (possibly negated) and "PHI += ext(A)".
/// On success, appends the chain together with its scale factor — the known
/// integer ratio of the accumulator's bit width to the extend's input bit
/// width — to \p Chains and returns true. Recurses through the non-phi
/// operand so chained updates can each form their own partial reduction.
bool VPRecipeBuilder::getScaledReductions(
    Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
    SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
  // The reduction update must be inside the loop being vectorized.
  if (!CM.TheLoop->contains(Inst: RdxExitInstr))
    return false;

  auto *Update = dyn_cast<BinaryOperator>(Val: RdxExitInstr);
  if (!Update)
    return false;

  // Normalize so that Op is the non-phi operand and PhiOp is the accumulator.
  Value *Op = Update->getOperand(i_nocapture: 0);
  Value *PhiOp = Update->getOperand(i_nocapture: 1);
  if (Op == PHI)
    std::swap(a&: Op, b&: PhiOp);

  // Try and get a scaled reduction from the first non-phi operand.
  // If one is found, we use the discovered reduction instruction in
  // place of the accumulator for costing.
  if (auto *OpInst = dyn_cast<Instruction>(Val: Op)) {
    if (getScaledReductions(PHI, RdxExitInstr: OpInst, Range, Chains)) {
      // The inner chain's reduction now acts as this update's accumulator;
      // re-derive Op/PhiOp against it.
      PHI = Chains.rbegin()->first.Reduction;

      Op = Update->getOperand(i_nocapture: 0);
      PhiOp = Update->getOperand(i_nocapture: 1);
      if (Op == PHI)
        std::swap(a&: Op, b&: PhiOp);
    }
  }
  // One operand of the update must be the accumulator phi (or the inner
  // chain's reduction found above).
  if (PhiOp != PHI)
    return false;

  using namespace llvm::PatternMatch;

  // If the update is a binary operator, check both of its operands to see if
  // they are extends. Otherwise, see if the update comes directly from an
  // extend.
  Instruction *Exts[2] = {nullptr};
  BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Val: Op);
  std::optional<unsigned> BinOpc;
  Type *ExtOpTypes[2] = {nullptr};

  // Checks that each value in \p Ops is a zext/sext of an in-loop value and
  // records the extend instructions and their input types in Exts/ExtOpTypes.
  auto CollectExtInfo = [this, &Exts,
                         &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
    unsigned I = 0;
    for (Value *OpI : Ops) {
      Value *ExtOp;
      if (!match(V: OpI, P: m_ZExtOrSExt(Op: m_Value(V&: ExtOp))))
        return false;
      Exts[I] = cast<Instruction>(Val: OpI);

      // TODO: We should be able to support live-ins.
      if (!CM.TheLoop->contains(Inst: Exts[I]))
        return false;

      ExtOpTypes[I] = ExtOp->getType();
      I++;
    }
    return true;
  };

  if (ExtendUser) {
    // The extends must feed only this chain; multiple uses would force the
    // extends to also exist separately from the lowered partial reduction.
    if (!ExtendUser->hasOneUse())
      return false;

    // Use the side-effect of match to replace BinOp only if the pattern is
    // matched, we don't care at this point whether it actually matched.
    match(V: ExtendUser, P: m_Neg(V: m_BinOp(I&: ExtendUser)));

    SmallVector<Value *> Ops(ExtendUser->operands());
    if (!CollectExtInfo(Ops))
      return false;

    BinOpc = std::make_optional(t: ExtendUser->getOpcode());
  } else if (match(V: Update, P: m_Add(L: m_Value(), R: m_Value()))) {
    // Single-extend form: the add's non-phi operand is itself the extend.
    // We already know the operands for Update are Op and PhiOp.
    SmallVector<Value *> Ops({Op});
    if (!CollectExtInfo(Ops))
      return false;

    ExtendUser = Update;
    BinOpc = std::nullopt;
  } else
    return false;

  // Exts[1] stays null for the single-extend form; report PR_None for it.
  TTI::PartialReductionExtendKind OpAExtend =
      TTI::getPartialReductionExtendKind(I: Exts[0]);
  TTI::PartialReductionExtendKind OpBExtend =
      Exts[1] ? TTI::getPartialReductionExtendKind(I: Exts[1]) : TTI::PR_None;
  PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);

  // The scale factor is the exact ratio of accumulator bits to extend-input
  // bits; bail out if that ratio is not a known integer.
  TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
  TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
  if (!PHISize.hasKnownScalarFactor(RHS: ASize))
    return false;
  unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(RHS: ASize);

  // Only accept the chain for VFs where the target reports a valid partial
  // reduction cost, clamping Range accordingly.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) {
            InstructionCost Cost = TTI->getPartialReductionCost(
                Opcode: Update->getOpcode(), InputTypeA: ExtOpTypes[0], InputTypeB: ExtOpTypes[1],
                AccumType: PHI->getType(), VF, OpAExtend, OpBExtend, BinOp: BinOpc, CostKind: CM.CostKind);
            return Cost.isValid();
          },
          Range)) {
    Chains.emplace_back(Args&: Chain, Args&: TargetScaleFactor);
    return true;
  }

  return false;
}
8188
/// Try to convert \p R, a VPlan0 recipe wrapping an original scalar
/// instruction, into a widened recipe valid for \p Range. Dispatches in
/// priority order to the specialized builders (induction phis, reduction/FOR
/// phis, truncates, calls, histograms, memory ops, partial reductions) and
/// falls back to generic widening. Returns nullptr if the instruction should
/// instead be replicated (handled by the caller).
VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
                                                      VFRange &Range) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  Instruction *Instr = R->getUnderlyingInstr();
  SmallVector<VPValue *, 4> Operands(R->operands());
  if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(Val: R)) {
    // Only header phis reach this point; non-header phis were lowered during
    // predication.
    VPBasicBlock *Parent = PhiR->getParent();
    [[maybe_unused]] VPRegionBlock *LoopRegionOf =
        Parent->getEnclosingLoopRegion();
    assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
           "Non-header phis should have been handled during predication");
    auto *Phi = cast<PHINode>(Val: R->getUnderlyingInstr());
    assert(Operands.size() == 2 && "Must have 2 operands for header phis");
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return Recipe;

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here");
    // Operands[0] is the start (preheader) value, Operands[1] the backedge
    // value.
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(PN: Phi)) {
      const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(PN: Phi);
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));

      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr()).value_or(u: 1);
      PhiRecipe = new VPReductionPHIRecipe(
          Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
          CM.useOrderedReductions(RdxDesc), ScaleFactor);
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }
    // Add backedge value.
    PhiRecipe->addOperand(Operand: Operands[1]);
    return PhiRecipe;
  }

  if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                     I: cast<TruncInst>(Val: Instr), Operands, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Val: Instr))
    return tryToWidenCall(CI, Operands, Range);

  // Stores recognized as histogram updates get a dedicated recipe.
  if (StoreInst *SI = dyn_cast<StoreInst>(Val: Instr))
    if (auto HistInfo = Legal->getHistogramInfo(I: SI))
      return tryToWidenHistogram(HI: *HistInfo, Operands);

  if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
    return tryToWidenMemory(I: Instr, Operands, Range);

  // Reduction updates registered by collectScaledReductions become partial
  // reduction recipes.
  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(ExitInst: Instr))
    return tryToCreatePartialReduction(Reduction: Instr, Operands, ScaleFactor: ScaleFactor.value());

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
    return new VPWidenGEPRecipe(GEP, Operands);

  if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
    return new VPWidenSelectRecipe(*SI, Operands);
  }

  if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
                                 *CI);
  }

  // Generic fallback for remaining widenable instructions.
  return tryToWiden(I: Instr, Operands);
}
8274
/// Create a VPPartialReductionRecipe for \p Reduction, the in-loop update of a
/// reduction previously validated by collectScaledReductions. \p Operands are
/// the update's two operands (bin-op result and accumulator, in either order);
/// \p ScaleFactor is the accumulator-to-input element-count ratio.
VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                             ArrayRef<VPValue *> Operands,
                                             unsigned ScaleFactor) {
  assert(Operands.size() == 2 &&
         "Unexpected number of operands for partial reduction");

  VPValue *BinOp = Operands[0];
  VPValue *Accumulator = Operands[1];
  // Normalize operand order: the accumulator is the one defined by a
  // reduction phi or a preceding partial reduction in a chain.
  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
  if (isa<VPReductionPHIRecipe>(Val: BinOpRecipe) ||
      isa<VPPartialReductionRecipe>(Val: BinOpRecipe))
    std::swap(a&: BinOp, b&: Accumulator);

  unsigned ReductionOpcode = Reduction->getOpcode();
  if (ReductionOpcode == Instruction::Sub) {
    // Canonicalize "acc - x" into "acc + (0 - x)" so the partial reduction
    // recipe only needs to handle adds.
    auto *const Zero = ConstantInt::get(Ty: Reduction->getType(), V: 0);
    SmallVector<VPValue *, 2> Ops;
    Ops.push_back(Elt: Plan.getOrAddLiveIn(V: Zero));
    Ops.push_back(Elt: BinOp);
    BinOp = new VPWidenRecipe(*Reduction, Ops);
    Builder.insert(R: BinOp->getDefiningRecipe());
    ReductionOpcode = Instruction::Add;
  }

  VPValue *Cond = nullptr;
  if (CM.blockNeedsPredicationForAnyReason(BB: Reduction->getParent())) {
    assert((ReductionOpcode == Instruction::Add ||
            ReductionOpcode == Instruction::Sub) &&
           "Expected an ADD or SUB operation for predicated partial "
           "reductions (because the neutral element in the mask is zero)!");
    // Mask out inactive lanes by selecting zero (the add's neutral element).
    Cond = getBlockInMask(VPBB: Builder.getInsertBlock());
    VPValue *Zero =
        Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: Reduction->getType(), V: 0));
    BinOp = Builder.createSelect(Cond, TrueVal: BinOp, FalseVal: Zero, DL: Reduction->getDebugLoc());
  }
  return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
                                      ScaleFactor, Reduction);
}
8314
/// Build VPlans covering all VFs in [\p MinVF, \p MaxVF]. Each call to
/// tryToBuildVPlanWithVPRecipes consumes a sub-range of VFs (clamped via
/// VFRange), so the loop emits one plan per decision-compatible VF range.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  // VFRange ends are exclusive, so use MaxVF * 2 to include MaxVF itself.
  auto MaxVFTimes2 = MaxVF * 2;
  // Build the plain-CFG VPlan once and duplicate it per sub-range.
  auto VPlan0 = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
      bool HasScalarVF = Plan->hasScalarVFOnly();
      // Now optimize the initial VPlan.
      if (!HasScalarVF)
        VPlanTransforms::runPass(Fn: VPlanTransforms::truncateToMinimalBitwidths,
                                 Plan&: *Plan, Args: CM.getMinimalBitwidths());
      VPlanTransforms::runPass(Fn: VPlanTransforms::optimize, Plan&: *Plan);
      // TODO: try to put it close to addActiveLaneMask().
      // Discard the plan if it is not EVL-compatible
      if (CM.foldTailWithEVL() && !HasScalarVF &&
          !VPlanTransforms::runPass(Transform: VPlanTransforms::tryAddExplicitVectorLength,
                                    Plan&: *Plan, Args: CM.getMaxSafeElements()))
        break;
      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(Elt: std::move(Plan));
    }
    // Continue from the first VF not covered by the plan just built.
    VF = SubRange.End;
  }
}
8356
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction. The phi's incoming values are the computed
/// end value (from the vector loop) and the induction's start value (bypass).
static VPInstruction *addResumePhiRecipeForInduction(
    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
  // Truncated wide inductions resume from the last lane of their vector value
  // in the last vector iteration which is handled elsewhere.
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  VPValue *Start = WideIV->getStartValue();
  VPValue *Step = WideIV->getStepValue();
  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
  // For the canonical induction the vector trip count IS the end value;
  // otherwise derive it as Start + VectorTC * Step (per the induction kind).
  VPValue *EndValue = VectorTC;
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    EndValue = VectorPHBuilder.createDerivedIV(
        Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        Start, Current: VectorTC, Step);
  }

  // EndValue is derived from the vector trip count (which has the same type as
  // the widest induction) and thus may be wider than the induction here.
  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
    EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
                                                ResultTy: ScalarTypeOfWideIV,
                                                DL: WideIV->getDebugLoc());
  }

  auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
      IncomingValues: {EndValue, Start}, DL: WideIV->getDebugLoc(), Name: "bc.resume.val");
  return ResumePhiRecipe;
}
8392
/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header. End values for inductions are added to
/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  auto *ScalarPH = Plan.getScalarPreheader();
  // The scalar preheader's first predecessor is the middle block.
  auto *MiddleVPBB = cast<VPBasicBlock>(Val: ScalarPH->getPredecessors()[0]);
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBuilder VectorPHBuilder(
      cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor()));
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
  VPBuilder ScalarPHBuilder(ScalarPH);
  for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
    auto *ScalarPhiIRI = cast<VPIRPhi>(Val: &ScalarPhiR);

    // TODO: Extract final value from induction recipe initially, optimize to
    // pre-computed end value together in optimizeInductionExitUsers.
    auto *VectorPhiR =
        cast<VPHeaderPHIRecipe>(Val: Builder.getRecipe(I: &ScalarPhiIRI->getIRPhi()));
    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(Val: VectorPhiR)) {
      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
              WideIV: WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
              VectorTC: &Plan.getVectorTripCount())) {
        assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
        // Operand 0 of the resume phi is the induction's end value.
        IVEndValues[WideIVR] = ResumePhi->getOperand(N: 0);
        ScalarPhiIRI->addOperand(Operand: ResumePhi);
        continue;
      }
      // TODO: Also handle truncated inductions here. Computing end-values
      // separately should be done as VPlan-to-VPlan optimization, after
      // legalizing all resume values to use the last lane from the loop.
      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
             "should only skip truncated wide inductions");
      continue;
    }

    // The backedge value provides the value to resume coming out of a loop,
    // which for FORs is a vector whose last element needs to be extracted. The
    // start value provides the value if the loop is bypassed.
    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(Val: VectorPhiR);
    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");
    if (IsFOR)
      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
          Opcode: VPInstruction::ExtractLastElement, Operands: {ResumeFromVectorLoop}, Inst: {},
          Name: "vector.recur.extract");
    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, DL: {}, Name);
    ScalarPhiIRI->addOperand(Operand: ResumePhiR);
  }
}
8448
// Collect VPIRInstructions for phis in the exit block from the latch only.
// Exit blocks reached via early exits are skipped; phis with live-in operands
// need no fixing and are also skipped.
static SetVector<VPIRInstruction *> collectUsersInLatchExitBlock(VPlan &Plan) {
  SetVector<VPIRInstruction *> ExitUsersToFix;
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {

    // Only consider exit blocks reached solely from the middle block (i.e.
    // via the latch); early-exit targets are handled elsewhere.
    if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock())
      continue;

    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);
      assert(ExitIRI->getNumOperands() == 1 && "must have a single operand");
      VPValue *V = ExitIRI->getOperand(N: 0);
      // Live-ins are loop-invariant and need no extract.
      if (V->isLiveIn())
        continue;
      assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
             "Only recipes defined inside a region should need fixing.");
      ExitUsersToFix.insert(X: ExitIRI);
    }
  }
  return ExitUsersToFix;
}
8470
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated. The extracts are
// inserted in the middle block, after any phis.
static void
addUsersInExitBlocks(VPlan &Plan,
                     const SetVector<VPIRInstruction *> &ExitUsersToFix) {
  if (ExitUsersToFix.empty())
    return;

  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Introduce extract for exiting values and update the VPIRInstructions
  // modeling the corresponding LCSSA phis.
  for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
    assert(ExitIRI->getNumOperands() == 1 &&
           ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
           "exit values from early exits must be fixed when branch to "
           "early-exit is added");
    // Replace the phi's operand with an extract of its last lane.
    ExitIRI->extractLastLaneOfFirstOperand(Builder&: B);
  }
}
8492
/// Handle users in the exit block for first order reductions in the original
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi. Fixed users are removed from \p ExitUsersToFix; any remaining
/// users are handled later by addUsersInExitBlocks.
static void addExitUsersForFirstOrderRecurrences(
    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *ScalarPHVPBB = Plan.getScalarPreheader();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Predicate used to detect (and clamp away) the VF <vscale x 1> case below.
  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(MinVal: 1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences, creating
    // extract for users outside the loop. An overview of the transformation is
    // described below. Suppose we have the following loop with some use after
    // the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because it's value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand for
    // the VPIRInstruction modeling the phi.
    for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
      if (ExitIRI->getOperand(N: 0) != FOR)
        continue;
      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead, we rely on function
      // addUsersInExitBlocks to extract the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the
      // recurrence phi in ExitUsersToFix.
      // TODO: Consider vscale_range info and UF.
      if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
                                                             Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          Opcode: VPInstruction::ExtractPenultimateElement, Operands: {FOR->getBackedgeValue()},
          Inst: {}, Name: "vector.recur.extract.for.phi");
      ExitIRI->setOperand(I: 0, New: PenultimateElement);
      // This user is now fully handled; don't let addUsersInExitBlocks touch
      // it again.
      ExitUsersToFix.remove(X: ExitIRI);
    }
  }
}
8607
8608VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8609 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8610
8611 using namespace llvm::VPlanPatternMatch;
8612 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8613
8614 // ---------------------------------------------------------------------------
8615 // Build initial VPlan: Scan the body of the loop in a topological order to
8616 // visit each basic block after having visited its predecessor basic blocks.
8617 // ---------------------------------------------------------------------------
8618
8619 // Create initial VPlan skeleton, having a basic block for the pre-header
8620 // which contains SCEV expansions that need to happen before the CFG is
8621 // modified; a basic block for the vector pre-header, followed by a region for
8622 // the vector loop, followed by the middle basic block. The skeleton vector
8623 // loop region contains a header and latch basic blocks.
8624
8625 bool RequiresScalarEpilogueCheck =
8626 LoopVectorizationPlanner::getDecisionAndClampRange(
8627 Predicate: [this](ElementCount VF) {
8628 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8629 },
8630 Range);
8631 VPlanTransforms::prepareForVectorization(
8632 Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
8633 TailFolded: CM.foldTailByMasking(), TheLoop: OrigLoop,
8634 IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()),
8635 HasUncountableExit: Legal->hasUncountableEarlyExit(), Range);
8636 VPlanTransforms::createLoopRegions(Plan&: *Plan);
8637
8638 // Don't use getDecisionAndClampRange here, because we don't know the UF
8639 // so this function is better to be conservative, rather than to split
8640 // it up into different VPlans.
8641 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8642 bool IVUpdateMayOverflow = false;
8643 for (ElementCount VF : Range)
8644 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8645
8646 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8647 // Use NUW for the induction increment if we proved that it won't overflow in
8648 // the vector loop or when not folding the tail. In the later case, we know
8649 // that the canonical induction increment will not overflow as the vector trip
8650 // count is >= increment and a multiple of the increment.
8651 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8652 if (!HasNUW) {
8653 auto *IVInc = Plan->getVectorLoopRegion()
8654 ->getExitingBasicBlock()
8655 ->getTerminator()
8656 ->getOperand(N: 0);
8657 assert(match(IVInc, m_VPInstruction<Instruction::Add>(
8658 m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
8659 "Did not find the canonical IV increment");
8660 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8661 }
8662
8663 // ---------------------------------------------------------------------------
8664 // Pre-construction: record ingredients whose recipes we'll need to further
8665 // process after constructing the initial VPlan.
8666 // ---------------------------------------------------------------------------
8667
8668 // For each interleave group which is relevant for this (possibly trimmed)
8669 // Range, add it to the set of groups to be later applied to the VPlan and add
8670 // placeholders for its members' Recipes which we'll be replacing with a
8671 // single VPInterleaveRecipe.
8672 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8673 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8674 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8675 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8676 LoopVectorizationCostModel::CM_Interleave);
8677 // For scalable vectors, the interleave factors must be <= 8 since we
8678 // require the (de)interleaveN intrinsics instead of shufflevectors.
8679 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8680 "Unsupported interleave factor for scalable vectors");
8681 return Result;
8682 };
8683 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8684 continue;
8685 InterleaveGroups.insert(Ptr: IG);
8686 }
8687
8688 // ---------------------------------------------------------------------------
8689 // Predicate and linearize the top-level loop region.
8690 // ---------------------------------------------------------------------------
8691 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8692 Plan&: *Plan, FoldTail: CM.foldTailByMasking());
8693
8694 // ---------------------------------------------------------------------------
8695 // Construct wide recipes and apply predication for original scalar
8696 // VPInstructions in the loop.
8697 // ---------------------------------------------------------------------------
8698 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8699 Builder, BlockMaskCache, LVer);
8700 RecipeBuilder.collectScaledReductions(Range);
8701
8702 // Scan the body of the loop in a topological order to visit each basic block
8703 // after having visited its predecessor basic blocks.
8704 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8705 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8706 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8707 HeaderVPBB);
8708
8709 auto *MiddleVPBB = Plan->getMiddleBlock();
8710 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8711 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8712 // temporarily to update created block masks.
8713 DenseMap<VPValue *, VPValue *> Old2New;
8714 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8715 // Convert input VPInstructions to widened recipes.
8716 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
8717 auto *SingleDef = cast<VPSingleDefRecipe>(Val: &R);
8718 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8719 // Skip recipes that do not need transforming, including canonical IV,
8720 // wide canonical IV and VPInstructions without underlying values. The
8721 // latter are added above for masking.
8722 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8723 // to construct recipes below to not use the underlying instruction.
8724 if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
8725 Val: &R) ||
8726 (isa<VPInstruction>(Val: &R) && !UnderlyingValue))
8727 continue;
8728
8729 // FIXME: VPlan0, which models a copy of the original scalar loop, should
8730 // not use VPWidenPHIRecipe to model the phis.
8731 assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
8732 UnderlyingValue && "unsupported recipe");
8733
8734 // TODO: Gradually replace uses of underlying instruction by analyses on
8735 // VPlan.
8736 Instruction *Instr = cast<Instruction>(Val: UnderlyingValue);
8737 Builder.setInsertPoint(SingleDef);
8738
8739 // The stores with invariant address inside the loop will be deleted, and
8740 // in the exit block, a uniform store recipe will be created for the final
8741 // invariant store of the reduction.
8742 StoreInst *SI;
8743 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8744 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8745 // Only create recipe for the final invariant store of the reduction.
8746 if (Legal->isInvariantStoreOfReduction(SI)) {
8747 auto *Recipe =
8748 new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
8749 nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
8750 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8751 }
8752 R.eraseFromParent();
8753 continue;
8754 }
8755
8756 VPRecipeBase *Recipe =
8757 RecipeBuilder.tryToCreateWidenRecipe(R: SingleDef, Range);
8758 if (!Recipe) {
8759 SmallVector<VPValue *, 4> Operands(R.operands());
8760 Recipe = RecipeBuilder.handleReplication(I: Instr, Operands, Range);
8761 }
8762
8763 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8764 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8765 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8766 // moved to the phi section in the header.
8767 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8768 } else {
8769 Builder.insert(R: Recipe);
8770 }
8771 if (Recipe->getNumDefinedValues() == 1) {
8772 SingleDef->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8773 Old2New[SingleDef] = Recipe->getVPSingleValue();
8774 } else {
8775 assert(Recipe->getNumDefinedValues() == 0 &&
8776 "Unexpected multidef recipe");
8777 R.eraseFromParent();
8778 }
8779 }
8780 }
8781
8782 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8783 // TODO: Include the masks as operands in the predicated VPlan directly
8784 // to remove the need to keep a map of masks beyond the predication
8785 // transform.
8786 RecipeBuilder.updateBlockMaskCache(Old2New);
8787 for (const auto &[Old, _] : Old2New)
8788 Old->getDefiningRecipe()->eraseFromParent();
8789
8790 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8791 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8792 "entry block must be set to a VPRegionBlock having a non-empty entry "
8793 "VPBasicBlock");
8794
8795 // Update wide induction increments to use the same step as the corresponding
8796 // wide induction. This enables detecting induction increments directly in
8797 // VPlan and removes redundant splats.
8798 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
8799 auto *IVInc = cast<Instruction>(
8800 Val: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
8801 if (IVInc->getOperand(i: 0) != Phi || IVInc->getOpcode() != Instruction::Add)
8802 continue;
8803 VPWidenInductionRecipe *WideIV =
8804 cast<VPWidenInductionRecipe>(Val: RecipeBuilder.getRecipe(I: Phi));
8805 VPRecipeBase *R = RecipeBuilder.getRecipe(I: IVInc);
8806 R->setOperand(I: 1, New: WideIV->getStepValue());
8807 }
8808
8809 DenseMap<VPValue *, VPValue *> IVEndValues;
8810 addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues);
8811 SetVector<VPIRInstruction *> ExitUsersToFix =
8812 collectUsersInLatchExitBlock(Plan&: *Plan);
8813 addExitUsersForFirstOrderRecurrences(Plan&: *Plan, ExitUsersToFix, Range);
8814 addUsersInExitBlocks(Plan&: *Plan, ExitUsersToFix);
8815
8816 // ---------------------------------------------------------------------------
8817 // Transform initial VPlan: Apply previously taken decisions, in order, to
8818 // bring the VPlan to its final state.
8819 // ---------------------------------------------------------------------------
8820
8821 // Adjust the recipes for any inloop reductions.
8822 adjustRecipesForReductions(Plan, RecipeBuilder, MinVF: Range.Start);
8823
8824 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8825 // NaNs if possible, bail out otherwise.
8826 if (!VPlanTransforms::runPass(Transform: VPlanTransforms::handleMaxMinNumReductions,
8827 Plan&: *Plan))
8828 return nullptr;
8829
8830 // Transform recipes to abstract recipes if it is legal and beneficial and
8831 // clamp the range for better cost estimation.
8832 // TODO: Enable following transform when the EVL-version of extended-reduction
8833 // and mulacc-reduction are implemented.
8834 if (!CM.foldTailWithEVL()) {
8835 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
8836 CM.CostKind);
8837 VPlanTransforms::runPass(Fn: VPlanTransforms::convertToAbstractRecipes, Plan&: *Plan,
8838 Args&: CostCtx, Args&: Range);
8839 }
8840
8841 for (ElementCount VF : Range)
8842 Plan->addVF(VF);
8843 Plan->setName("Initial VPlan");
8844
8845 // Interleave memory: for each Interleave Group we marked earlier as relevant
8846 // for this VPlan, replace the Recipes widening its memory instructions with a
8847 // single VPInterleaveRecipe at its insertion point.
8848 VPlanTransforms::runPass(Fn: VPlanTransforms::createInterleaveGroups, Plan&: *Plan,
8849 Args: InterleaveGroups, Args&: RecipeBuilder,
8850 Args: CM.isScalarEpilogueAllowed());
8851
8852 // Replace VPValues for known constant strides guaranteed by predicate scalar
8853 // evolution.
8854 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
8855 auto *R = cast<VPRecipeBase>(Val: &U);
8856 return R->getParent()->getParent() ||
8857 R->getParent() ==
8858 Plan->getVectorLoopRegion()->getSinglePredecessor();
8859 };
8860 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8861 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
8862 auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV));
8863 // Only handle constant strides for now.
8864 if (!ScevStride)
8865 continue;
8866
8867 auto *CI = Plan->getOrAddLiveIn(
8868 V: ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt()));
8869 if (VPValue *StrideVPV = Plan->getLiveIn(V: StrideV))
8870 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
8871
8872 // The versioned value may not be used in the loop directly but through a
8873 // sext/zext. Add new live-ins in those cases.
8874 for (Value *U : StrideV->users()) {
8875 if (!isa<SExtInst, ZExtInst>(Val: U))
8876 continue;
8877 VPValue *StrideVPV = Plan->getLiveIn(V: U);
8878 if (!StrideVPV)
8879 continue;
8880 unsigned BW = U->getType()->getScalarSizeInBits();
8881 APInt C = isa<SExtInst>(Val: U) ? ScevStride->getAPInt().sext(width: BW)
8882 : ScevStride->getAPInt().zext(width: BW);
8883 VPValue *CI = Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: U->getType(), V: C));
8884 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
8885 }
8886 }
8887
8888 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8889 return Legal->blockNeedsPredication(BB);
8890 };
8891 VPlanTransforms::runPass(Fn: VPlanTransforms::dropPoisonGeneratingRecipes, Plan&: *Plan,
8892 Args: BlockNeedsPredication);
8893
8894 // Sink users of fixed-order recurrence past the recipe defining the previous
8895 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8896 if (!VPlanTransforms::runPass(Transform: VPlanTransforms::adjustFixedOrderRecurrences,
8897 Plan&: *Plan, Args&: Builder))
8898 return nullptr;
8899
8900 if (useActiveLaneMask(Style)) {
8901 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8902 // TailFoldingStyle is visible there.
8903 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8904 bool WithoutRuntimeCheck =
8905 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8906 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
8907 DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
8908 }
8909 VPlanTransforms::optimizeInductionExitUsers(Plan&: *Plan, EndValues&: IVEndValues);
8910
8911 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8912 return Plan;
8913}
8914
// Build a VPlan for the VPlan-native (outer-loop) vectorization path covering
// all VFs in \p Range. Returns nullptr if the plan cannot be constructed,
// e.g. when VPInstructions cannot be converted to widened recipes.
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // First build a plain CFG skeleton of the loop nest, then shape it for
  // vectorization and introduce the region structure for the loops.
  auto Plan = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI);
  VPlanTransforms::prepareForVectorization(
      Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck: true, TailFolded: false, TheLoop: OrigLoop,
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), HasUncountableExit: false,
      Range);
  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Record all candidate VFs this plan is valid for.
  for (ElementCount VF : Range)
    Plan->addVF(VF);

  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
          Plan,
          GetIntOrFpInductionDescriptor: [this](PHINode *P) {
            return Legal->getIntOrFpInductionDescriptor(Phi: P);
          },
          SE&: *PSE.getSE(), TLI: *TLI))
    return nullptr;

  // Collect mapping of IR header phis to header phi recipes, to be used in
  // addScalarResumePhis.
  DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder, BlockMaskCache, nullptr /*LVer*/);
  for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The canonical IV recipe has no underlying IR phi to register.
    if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
      continue;
    auto *HeaderR = cast<VPHeaderPHIRecipe>(Val: &R);
    RecipeBuilder.setRecipe(I: HeaderR->getUnderlyingInstr(), R: HeaderR);
  }
  DenseMap<VPValue *, VPValue *> IVEndValues;
  // TODO: IVEndValues are not used yet in the native path, to optimize exit
  // values.
  addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8960
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be converted
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and users outside the vector region when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
//
// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in any
// iteration. The final value is selected by the final ComputeReductionResult.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  // Recipes replaced below; erased at the end once no stale uses remain.
  SmallVector<VPRecipeBase *> ToDelete;

  // Phase 1: rewrite in-loop reduction chains into VPReductionRecipes.
  for (VPRecipeBase &R : Header->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
      continue;

    RecurKind Kind = PhiR->getRecurrenceKind();
    assert(
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
        !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
        "AnyOf and FindIV reductions are not allowed for in-loop reductions");

    // Collect the chain of "link" recipes for the reduction starting at PhiR.
    // The SetVector is grown while iterating, yielding a top-down traversal of
    // the use chain inside the loop region.
    SetVector<VPSingleDefRecipe *> Worklist;
    Worklist.insert(X: PhiR);
    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPSingleDefRecipe *Cur = Worklist[I];
      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = cast<VPSingleDefRecipe>(Val: U);
        if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
          assert((UserRecipe->getParent() == MiddleVPBB ||
                  UserRecipe->getParent() == Plan->getScalarPreheader()) &&
                 "U must be either in the loop region, the middle block or the "
                 "scalar preheader.");
          continue;
        }
        Worklist.insert(X: UserRecipe);
      }
    }

    // Visit operation "Links" along the reduction chain top-down starting from
    // the phi until LoopExitValue. We keep track of the previous item
    // (PreviousLink) to tell which of the two operands of a Link will remain
    // scalar and which will be reduced. For minmax by select(cmp), Link will be
    // the select instructions. Blend recipes of in-loop reduction phi's will
    // get folded to their non-phi operand, as the reduction recipe handles the
    // condition directly.
    VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
    for (VPSingleDefRecipe *CurrentLink : drop_begin(RangeOrContainer&: Worklist)) {
      if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink)) {
        assert(Blend->getNumIncomingValues() == 2 &&
               "Blend must have 2 incoming values");
        // Fold the blend to whichever incoming value is not the phi.
        if (Blend->getIncomingValue(Idx: 0) == PhiR) {
          Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1));
        } else {
          assert(Blend->getIncomingValue(1) == PhiR &&
                 "PhiR must be an operand of the blend");
          Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0));
        }
        continue;
      }

      Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();

      // Index of the first operand which holds a non-mask vector operand.
      unsigned IndexOfFirstOperand;
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      VPValue *VecOp;
      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
      if (IsFMulAdd) {
        assert(
            RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
            "Expected instruction to be a call to the llvm.fmuladd intrinsic");
        assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
                isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
               CurrentLink->getOperand(2) == PreviousLink &&
               "expected a call where the previous link is the added operand");

        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe (multiplying the first two operands of
        // the fmuladd together) to use as the vector operand for the fadd
        // reduction.
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(Val: CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          // Operand 0 of a select is the condition; skip it.
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(N: VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                   PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      // If the link's block is predicated, the reduction op itself must be
      // masked; pass the block-in mask as the condition operand.
      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(BB: CurrentLinkI->getParent()))
        CondOp = RecipeBuilder.getBlockInMask(VPBB: CurrentLink->getParent());

      // TODO: Retrieve FMFs from recipes directly.
      RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
          PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
      // Non-FP RdxDescs will have all fast math flags set, so clear them.
      FastMathFlags FMFs = isa<FPMathOperator>(Val: CurrentLinkI)
                               ? RdxDesc.getFastMathFlags()
                               : FastMathFlags();
      auto *RedRecipe = new VPReductionRecipe(
          Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
          PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of it's inputs, including CondOp.
      // Delete CurrentLink as it will be invalid if its operand is replaced
      // with a reduction defined at the bottom of the block in the next link.
      if (LinkVPBB->getNumSuccessors() == 0)
        RedRecipe->insertBefore(InsertPos: &*std::prev(x: std::prev(x: LinkVPBB->end())));
      else
        LinkVPBB->appendRecipe(Recipe: RedRecipe);

      CurrentLink->replaceAllUsesWith(New: RedRecipe);
      ToDelete.push_back(Elt: CurrentLink);
      PreviousLink = RedRecipe;
    }
  }
  // Phase 2: for every reduction phi, create the final-result computation in
  // the middle block and apply tail-folding / AnyOf / FindIV adjustments.
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    if (!PhiR)
      continue;

    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
    Type *PhiTy = PhiR->getUnderlyingValue()->getType();
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        !isa<VPPartialReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe())) {
      VPValue *Cond = RecipeBuilder.getBlockInMask(VPBB: PhiR->getParent());
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ? std::make_optional(t: RdxDesc.getFastMathFlags())
              : std::nullopt;
      NewExitingVPV =
          Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", FMFs);
      // Only the final-result computations should see the masked value; other
      // users (e.g. the backedge) keep the unmasked exiting value.
      OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
        return isa<VPInstruction>(Val: &U) &&
               (cast<VPInstruction>(Val: &U)->getOpcode() ==
                    VPInstruction::ComputeAnyOfResult ||
                cast<VPInstruction>(Val: &U)->getOpcode() ==
                    VPInstruction::ComputeReductionResult ||
                cast<VPInstruction>(Val: &U)->getOpcode() ==
                    VPInstruction::ComputeFindIVResult);
      });
      if (CM.usePredicatedReductionSelect())
        PhiR->setOperand(I: 1, New: NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      VPValue *Sentinel = Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue());
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeFindIVResult,
                               Operands: {PhiR, Start, Sentinel, NewExitingVPV}, DL: ExitDL);
    } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                               Operands: {PhiR, Start, NewExitingVPV}, DL: ExitDL);
    } else {
      VPIRFlags Flags =
          RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind: RecurrenceKind)
              ? VPIRFlags(RdxDesc.getFastMathFlags())
              : VPIRFlags();
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                               Operands: {PhiR, NewExitingVPV}, Flags, DL: ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      auto *Trunc =
          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
      Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe());
      Extnd->insertAfter(InsertPos: Trunc);
      if (PhiR->getOperand(N: 1) == NewExitingVPV)
        PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result.
      FinalReductionResult->setOperand(I: 1, New: Trunc);
      FinalReductionResult =
          Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
    }

    // Update all users outside the vector region. Also replace redundant
    // ExtractLastElement.
    for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);
      if (match(U, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
                       Op0: m_VPValue())))
        cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      auto *Select = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
        return isa<VPWidenSelectRecipe>(Val: U) ||
               (isa<VPReplicateRecipe>(Val: U) &&
                cast<VPReplicateRecipe>(Val: U)->getUnderlyingInstr()->getOpcode() ==
                    Instruction::Select);
      }));
      VPValue *Cmp = Select->getOperand(N: 0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
      Builder.setInsertPoint(Select);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (Select->getOperand(N: 1) == PhiR)
        Cmp = Builder.createNot(Operand: Cmp);
      VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
      Select->getVPSingleValue()->replaceAllUsesWith(New: Or);
      // Delete Select now that it has invalid types.
      ToDelete.push_back(Elt: Select);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: ConstantInt::getFalse(
                              Context&: OrigLoop->getHeader()->getContext())));
      continue;
    }

    if (RecurrenceDescriptor::isFindIVRecurrenceKind(
            Kind: RdxDesc.getRecurrenceKind())) {
      // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
      // the sentinel value after generating the ResumePhi recipe, which uses
      // the original start value.
      PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue()));
    }
    // For plain reductions, seed the phi with a ReductionStartVector built
    // from the start value and the recurrence's identity element.
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: RdxDesc.getFastMathFlags()));
      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          RecipeBuilder.getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr())
              .value_or(u: 1);
      Type *I32Ty = IntegerType::getInt32Ty(C&: PhiTy->getContext());
      auto *ScaleFactorVPV =
          Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: ScaleFactor));
      VPValue *StartV = PHBuilder.createNaryOp(
          Opcode: VPInstruction::ReductionStartVector,
          Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV},
          Flags: PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
                                       : FastMathFlags());
      PhiR->setOperand(I: 0, New: StartV);
    }
  }
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  VPlanTransforms::runPass(Fn: VPlanTransforms::clearReductionWrapFlags, Plan&: *Plan);
}
9301
9302void LoopVectorizationPlanner::attachRuntimeChecks(
9303 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9304 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9305 if (SCEVCheckBlock) {
9306 assert((!CM.OptForSize ||
9307 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
9308 "Cannot SCEV check stride or overflow when optimizing for size");
9309 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
9310 AddBranchWeights: HasBranchWeights);
9311 }
9312 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9313 if (MemCheckBlock) {
9314 // VPlan-native path does not do any analysis for runtime checks
9315 // currently.
9316 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9317 "Runtime checks are not supported for outer loops yet");
9318
9319 if (CM.OptForSize) {
9320 assert(
9321 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
9322 "Cannot emit memory checks when optimizing for size, unless forced "
9323 "to vectorize.");
9324 ORE->emit(RemarkBuilder: [&]() {
9325 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9326 OrigLoop->getStartLoc(),
9327 OrigLoop->getHeader())
9328 << "Code-size may be reduced by not forcing "
9329 "vectorization, or by source-code modifications "
9330 "eliminating the need for runtime checks "
9331 "(e.g., adding 'restrict').";
9332 });
9333 }
9334 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
9335 AddBranchWeights: HasBranchWeights);
9336 }
9337}
9338
9339void VPDerivedIVRecipe::execute(VPTransformState &State) {
9340 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9341
9342 // Fast-math-flags propagate from the original induction instruction.
9343 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9344 if (FPBinOp)
9345 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9346
9347 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
9348 Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
9349 Value *DerivedIV = emitTransformedIndex(
9350 B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
9351 InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
9352 DerivedIV->setName(Name);
9353 // If index is the vector trip count, the concrete value will only be set in
9354 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9355 // TODO: Remove the special case for the vector trip count once it is computed
9356 // in VPlan and can be used during VPlan simplification.
9357 assert((DerivedIV != Index ||
9358 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9359 "IV didn't need transforming?");
9360 State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
9361}
9362
9363// Determine how to lower the scalar epilogue, which depends on 1) optimising
9364// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9365// predication, and 4) a TTI hook that analyses whether the loop is suitable
9366// for predication.
9367static ScalarEpilogueLowering getScalarEpilogueLowering(
9368 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9369 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9370 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9371 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9372 // don't look at hints or options, and don't request a scalar epilogue.
9373 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9374 // LoopAccessInfo (due to code dependency and not being able to reliably get
9375 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9376 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9377 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9378 // back to the old way and vectorize with versioning when forced. See D81345.)
9379 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
9380 QueryType: PGSOQueryType::IRPass) &&
9381 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9382 return CM_ScalarEpilogueNotAllowedOptSize;
9383
9384 // 2) If set, obey the directives
9385 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9386 switch (PreferPredicateOverEpilogue) {
9387 case PreferPredicateTy::ScalarEpilogue:
9388 return CM_ScalarEpilogueAllowed;
9389 case PreferPredicateTy::PredicateElseScalarEpilogue:
9390 return CM_ScalarEpilogueNotNeededUsePredicate;
9391 case PreferPredicateTy::PredicateOrDontVectorize:
9392 return CM_ScalarEpilogueNotAllowedUsePredicate;
9393 };
9394 }
9395
9396 // 3) If set, obey the hints
9397 switch (Hints.getPredicate()) {
9398 case LoopVectorizeHints::FK_Enabled:
9399 return CM_ScalarEpilogueNotNeededUsePredicate;
9400 case LoopVectorizeHints::FK_Disabled:
9401 return CM_ScalarEpilogueAllowed;
9402 };
9403
9404 // 4) if the TTI hook indicates this is profitable, request predication.
9405 TailFoldingInfo TFI(TLI, &LVL, IAI);
9406 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
9407 return CM_ScalarEpilogueNotNeededUsePredicate;
9408
9409 return CM_ScalarEpilogueAllowed;
9410}
9411
9412// Process the loop in the VPlan-native vectorization path. This path builds
9413// VPlan upfront in the vectorization pipeline, which allows to apply
9414// VPlan-to-VPlan transformations from the very beginning without modifying the
9415// input LLVM IR.
9416static bool processLoopInVPlanNativePath(
9417 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9418 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9419 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9420 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9421 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9422 LoopVectorizationRequirements &Requirements) {
9423
9424 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
9425 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9426 return false;
9427 }
9428 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9429 Function *F = L->getHeader()->getParent();
9430 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9431
9432 ScalarEpilogueLowering SEL =
9433 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);
9434
9435 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9436 &Hints, IAI, PSI, BFI);
9437 // Use the planner for outer loop vectorization.
9438 // TODO: CM is not used at this point inside the planner. Turn CM into an
9439 // optional argument if we don't need it in the future.
9440 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9441 ORE);
9442
9443 // Get user vectorization factor.
9444 ElementCount UserVF = Hints.getWidth();
9445
9446 CM.collectElementTypesForWidening();
9447
9448 // Plan how to best vectorize, return the best VF and its cost.
9449 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9450
9451 // If we are stress testing VPlan builds, do not attempt to generate vector
9452 // code. Masked vector code generation support will follow soon.
9453 // Also, do not attempt to vectorize if no vector code will be produced.
9454 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9455 return false;
9456
9457 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9458
9459 {
9460 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9461 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9462 VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
9463 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9464 << L->getHeader()->getParent()->getName() << "\"\n");
9465 LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9466 }
9467
9468 reportVectorization(ORE, TheLoop: L, VF, IC: 1);
9469
9470 // Mark the loop as already vectorized to avoid vectorizing again.
9471 Hints.setAlreadyVectorized();
9472 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9473 return true;
9474}
9475
9476// Emit a remark if there are stores to floats that required a floating point
9477// extension. If the vectorized loop was generated with floating point there
9478// will be a performance penalty from the conversion overhead and the change in
9479// the vector width.
9480static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9481 SmallVector<Instruction *, 4> Worklist;
9482 for (BasicBlock *BB : L->getBlocks()) {
9483 for (Instruction &Inst : *BB) {
9484 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
9485 if (S->getValueOperand()->getType()->isFloatTy())
9486 Worklist.push_back(Elt: S);
9487 }
9488 }
9489 }
9490
9491 // Traverse the floating point stores upwards searching, for floating point
9492 // conversions.
9493 SmallPtrSet<const Instruction *, 4> Visited;
9494 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9495 while (!Worklist.empty()) {
9496 auto *I = Worklist.pop_back_val();
9497 if (!L->contains(Inst: I))
9498 continue;
9499 if (!Visited.insert(Ptr: I).second)
9500 continue;
9501
9502 // Emit a remark if the floating point store required a floating
9503 // point conversion.
9504 // TODO: More work could be done to identify the root cause such as a
9505 // constant or a function return type and point the user to it.
9506 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
9507 ORE->emit(RemarkBuilder: [&]() {
9508 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9509 I->getDebugLoc(), L->getHeader())
9510 << "floating point conversion changes vector width. "
9511 << "Mixed floating point precision requires an up/down "
9512 << "cast that will negatively impact performance.";
9513 });
9514
9515 for (Use &Op : I->operands())
9516 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
9517 Worklist.push_back(Elt: OpI);
9518 }
9519}
9520
9521/// For loops with uncountable early exits, find the cost of doing work when
9522/// exiting the loop early, such as calculating the final exit values of
9523/// variables used outside the loop.
9524/// TODO: This is currently overly pessimistic because the loop may not take
9525/// the early exit, but better to keep this conservative for now. In future,
9526/// it might be possible to relax this by using branch probabilities.
9527static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
9528 VPlan &Plan, ElementCount VF) {
9529 InstructionCost Cost = 0;
9530 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9531 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9532 // If the predecessor is not the middle.block, then it must be the
9533 // vector.early.exit block, which may contain work to calculate the exit
9534 // values of variables used outside the loop.
9535 if (PredVPBB != Plan.getMiddleBlock()) {
9536 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9537 << PredVPBB->getName() << ":\n");
9538 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
9539 }
9540 }
9541 }
9542 return Cost;
9543}
9544
9545/// This function determines whether or not it's still profitable to vectorize
9546/// the loop given the extra work we have to do outside of the loop:
9547/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9548/// to vectorize.
9549/// 2. In the case of loops with uncountable early exits, we may have to do
9550/// extra work when exiting the loop early, such as calculating the final
9551/// exit values of variables used outside the loop.
9552static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9553 VectorizationFactor &VF, Loop *L,
9554 PredicatedScalarEvolution &PSE,
9555 VPCostContext &CostCtx, VPlan &Plan,
9556 ScalarEpilogueLowering SEL,
9557 std::optional<unsigned> VScale) {
9558 InstructionCost TotalCost = Checks.getCost();
9559 if (!TotalCost.isValid())
9560 return false;
9561
9562 // Add on the cost of any work required in the vector early exit block, if
9563 // one exists.
9564 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
9565
9566 // When interleaving only scalar and vector cost will be equal, which in turn
9567 // would lead to a divide by 0. Fall back to hard threshold.
9568 if (VF.Width.isScalar()) {
9569 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9570 if (TotalCost > VectorizeMemoryCheckThreshold) {
9571 LLVM_DEBUG(
9572 dbgs()
9573 << "LV: Interleaving only is not profitable due to runtime checks\n");
9574 return false;
9575 }
9576 return true;
9577 }
9578
9579 // The scalar cost should only be 0 when vectorizing with a user specified
9580 // VF/IC. In those cases, runtime checks should always be generated.
9581 uint64_t ScalarC = VF.ScalarCost.getValue();
9582 if (ScalarC == 0)
9583 return true;
9584
9585 // First, compute the minimum iteration count required so that the vector
9586 // loop outperforms the scalar loop.
9587 // The total cost of the scalar loop is
9588 // ScalarC * TC
9589 // where
9590 // * TC is the actual trip count of the loop.
9591 // * ScalarC is the cost of a single scalar iteration.
9592 //
9593 // The total cost of the vector loop is
9594 // RtC + VecC * (TC / VF) + EpiC
9595 // where
9596 // * RtC is the cost of the generated runtime checks plus the cost of
9597 // performing any additional work in the vector.early.exit block for loops
9598 // with uncountable early exits.
9599 // * VecC is the cost of a single vector iteration.
9600 // * TC is the actual trip count of the loop
9601 // * VF is the vectorization factor
9602 // * EpiCost is the cost of the generated epilogue, including the cost
9603 // of the remaining scalar operations.
9604 //
9605 // Vectorization is profitable once the total vector cost is less than the
9606 // total scalar cost:
9607 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9608 //
9609 // Now we can compute the minimum required trip count TC as
9610 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9611 //
9612 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9613 // the computations are performed on doubles, not integers and the result
9614 // is rounded up, hence we get an upper estimate of the TC.
9615 unsigned IntVF = getEstimatedRuntimeVF(VF: VF.Width, VScale);
9616 uint64_t RtC = TotalCost.getValue();
9617 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9618 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);
9619
9620 // Second, compute a minimum iteration count so that the cost of the
9621 // runtime checks is only a fraction of the total scalar loop cost. This
9622 // adds a loop-dependent bound on the overhead incurred if the runtime
9623 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9624 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9625 // cost, compute
9626 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9627 uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);
9628
9629 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9630 // epilogue is allowed, choose the next closest multiple of VF. This should
9631 // partly compensate for ignoring the epilogue cost.
9632 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
9633 if (SEL == CM_ScalarEpilogueAllowed)
9634 MinTC = alignTo(Value: MinTC, Align: IntVF);
9635 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
9636
9637 LLVM_DEBUG(
9638 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9639 << VF.MinProfitableTripCount << "\n");
9640
9641 // Skip vectorization if the expected trip count is less than the minimum
9642 // required trip count.
9643 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9644 if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
9645 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9646 "trip count < minimum profitable VF ("
9647 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9648 << ")\n");
9649
9650 return false;
9651 }
9652 }
9653 return true;
9654}
9655
9656LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9657 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9658 !EnableLoopInterleaving),
9659 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9660 !EnableLoopVectorization) {}
9661
9662/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9663/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9664/// don't have a corresponding wide induction in \p EpiPlan.
9665static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9666 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9667 // will need their resume-values computed in the main vector loop. Others
9668 // can be removed from the main VPlan.
9669 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9670 for (VPRecipeBase &R :
9671 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9672 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
9673 continue;
9674 EpiWidenedPhis.insert(
9675 Ptr: cast<PHINode>(Val: R.getVPSingleValue()->getUnderlyingValue()));
9676 }
9677 for (VPRecipeBase &R :
9678 make_early_inc_range(Range: MainPlan.getScalarHeader()->phis())) {
9679 auto *VPIRInst = cast<VPIRPhi>(Val: &R);
9680 if (EpiWidenedPhis.contains(Ptr: &VPIRInst->getIRPhi()))
9681 continue;
9682 // There is no corresponding wide induction in the epilogue plan that would
9683 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9684 // together with the corresponding ResumePhi. The resume values for the
9685 // scalar loop will be created during execution of EpiPlan.
9686 VPRecipeBase *ResumePhi = VPIRInst->getOperand(N: 0)->getDefiningRecipe();
9687 VPIRInst->eraseFromParent();
9688 ResumePhi->eraseFromParent();
9689 }
9690 VPlanTransforms::runPass(Fn: VPlanTransforms::removeDeadRecipes, Plan&: MainPlan);
9691
9692 using namespace VPlanPatternMatch;
9693 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9694 // introduce multiple uses of undef/poison. If the reduction start value may
9695 // be undef or poison it needs to be frozen and the frozen start has to be
9696 // used when computing the reduction result. We also need to use the frozen
9697 // value in the resume phi generated by the main vector loop, as this is also
9698 // used to compute the reduction result after the epilogue vector loop.
9699 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9700 bool UpdateResumePhis) {
9701 VPBuilder Builder(Plan.getEntry());
9702 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9703 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9704 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9705 continue;
9706 VPValue *OrigStart = VPI->getOperand(N: 1);
9707 if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
9708 continue;
9709 VPInstruction *Freeze =
9710 Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, Inst: {}, Name: "fr");
9711 VPI->setOperand(I: 1, New: Freeze);
9712 if (UpdateResumePhis)
9713 OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
9714 return Freeze != &U && isa<VPPhi>(Val: &U);
9715 });
9716 }
9717 };
9718 AddFreezeForFindLastIVReductions(MainPlan, true);
9719 AddFreezeForFindLastIVReductions(EpiPlan, false);
9720
9721 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9722 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9723 // If there is a suitable resume value for the canonical induction in the
9724 // scalar (which will become vector) epilogue loop we are done. Otherwise
9725 // create it below.
9726 if (any_of(Range&: *MainScalarPH, P: [VectorTC](VPRecipeBase &R) {
9727 return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Op0: m_Specific(VPV: VectorTC),
9728 Op1: m_SpecificInt(V: 0)));
9729 }))
9730 return;
9731 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9732 ScalarPHBuilder.createScalarPhi(
9733 IncomingValues: {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, DL: {},
9734 Name: "vec.epilog.resume.val");
9735}
9736
9737/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9738/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9739static void
9740preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
9741 const SCEV2ValueTy &ExpandedSCEVs,
9742 const EpilogueLoopVectorizationInfo &EPI) {
9743 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9744 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9745 Header->setName("vec.epilog.vector.body");
9746
9747 DenseMap<Value *, Value *> ToFrozen;
9748 // Ensure that the start values for all header phi recipes are updated before
9749 // vectorizing the epilogue loop.
9750 for (VPRecipeBase &R : Header->phis()) {
9751 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &R)) {
9752 // When vectorizing the epilogue loop, the canonical induction start
9753 // value needs to be changed from zero to the value after the main
9754 // vector loop. Find the resume value created during execution of the main
9755 // VPlan.
9756 // FIXME: Improve modeling for canonical IV start values in the epilogue
9757 // loop.
9758 using namespace llvm::PatternMatch;
9759 Type *IdxTy = IV->getScalarType();
9760 PHINode *EPResumeVal = find_singleton<PHINode>(
9761 Range: L->getLoopPreheader()->phis(),
9762 P: [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
9763 if (P.getType() == IdxTy &&
9764 match(
9765 V: P.getIncomingValueForBlock(BB: EPI.MainLoopIterationCountCheck),
9766 P: m_SpecificInt(V: 0)) &&
9767 all_of(Range: P.incoming_values(), P: [&EPI](Value *Inc) {
9768 return Inc == EPI.VectorTripCount ||
9769 match(V: Inc, P: m_SpecificInt(V: 0));
9770 }))
9771 return &P;
9772 return nullptr;
9773 });
9774 assert(EPResumeVal && "must have a resume value for the canonical IV");
9775 VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
9776 assert(all_of(IV->users(),
9777 [](const VPUser *U) {
9778 return isa<VPScalarIVStepsRecipe>(U) ||
9779 isa<VPDerivedIVRecipe>(U) ||
9780 cast<VPRecipeBase>(U)->isScalarCast() ||
9781 cast<VPInstruction>(U)->getOpcode() ==
9782 Instruction::Add;
9783 }) &&
9784 "the canonical IV should only be used by its increment or "
9785 "ScalarIVSteps when resetting the start value");
9786 IV->setOperand(I: 0, New: VPV);
9787 continue;
9788 }
9789
9790 Value *ResumeV = nullptr;
9791 // TODO: Move setting of resume values to prepareToExecute.
9792 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
9793 auto *RdxResult =
9794 cast<VPInstruction>(Val: *find_if(Range: ReductionPhi->users(), P: [](VPUser *U) {
9795 auto *VPI = dyn_cast<VPInstruction>(Val: U);
9796 return VPI &&
9797 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9798 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9799 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9800 }));
9801 ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
9802 ->getIncomingValueForBlock(BB: L->getLoopPreheader());
9803 RecurKind RK = ReductionPhi->getRecurrenceKind();
9804 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
9805 Value *StartV = RdxResult->getOperand(N: 1)->getLiveInIRValue();
9806 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9807 // start value; compare the final value from the main vector loop
9808 // to the start value.
9809 BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
9810 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9811 ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
9812 } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK)) {
9813 Value *StartV = getStartValueFromReductionResult(RdxResult);
9814 ToFrozen[StartV] = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
9815 BB: EPI.MainLoopIterationCountCheck);
9816
9817 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9818 // an adjustment to the resume value. The resume value is adjusted to
9819 // the sentinel value when the final value from the main vector loop
9820 // equals the start value. This ensures correctness when the start value
9821 // might not be less than the minimum value of a monotonically
9822 // increasing induction variable.
9823 BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
9824 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9825 Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: ToFrozen[StartV]);
9826 Value *Sentinel = RdxResult->getOperand(N: 2)->getLiveInIRValue();
9827 ResumeV = Builder.CreateSelect(C: Cmp, True: Sentinel, False: ResumeV);
9828 } else {
9829 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9830 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9831 if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
9832 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9833 "unexpected start value");
9834 VPI->setOperand(I: 0, New: StartVal);
9835 continue;
9836 }
9837 }
9838 } else {
9839 // Retrieve the induction resume values for wide inductions from
9840 // their original phi nodes in the scalar loop.
9841 PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
9842 // Hook up to the PHINode generated by a ResumePhi recipe of main
9843 // loop VPlan, which feeds the scalar loop.
9844 ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
9845 }
9846 assert(ResumeV && "Must have a resume value");
9847 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9848 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
9849 }
9850
9851 // For some VPValues in the epilogue plan we must re-use the generated IR
9852 // values from the main plan. Replace them with live-in VPValues.
9853 // TODO: This is a workaround needed for epilogue vectorization and it
9854 // should be removed once induction resume value creation is done
9855 // directly in VPlan.
9856 for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
9857 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9858 // epilogue plan. This ensures all users use the same frozen value.
9859 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9860 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9861 VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
9862 V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
9863 continue;
9864 }
9865
9866 // Re-use the trip count and steps expanded for the main loop, as
9867 // skeleton creation needs it as a value that dominates both the scalar
9868 // and vector epilogue loops
9869 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
9870 if (!ExpandR)
9871 continue;
9872 VPValue *ExpandedVal =
9873 Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
9874 ExpandR->replaceAllUsesWith(New: ExpandedVal);
9875 if (Plan.getTripCount() == ExpandR)
9876 Plan.resetTripCount(NewTripCount: ExpandedVal);
9877 ExpandR->eraseFromParent();
9878 }
9879}
9880
9881// Generate bypass values from the additional bypass block. Note that when the
9882// vectorized epilogue is skipped due to iteration count check, then the
9883// resume value for the induction variable comes from the trip count of the
9884// main vector loop, passed as the second argument.
9885static Value *createInductionAdditionalBypassValues(
9886 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9887 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9888 Instruction *OldInduction) {
9889 Value *Step = getExpandedStep(ID: II, ExpandedSCEVs);
9890 // For the primary induction the additional bypass end value is known.
9891 // Otherwise it is computed.
9892 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9893 if (OrigPhi != OldInduction) {
9894 auto *BinOp = II.getInductionBinOp();
9895 // Fast-math-flags propagate from the original induction instruction.
9896 if (isa_and_nonnull<FPMathOperator>(Val: BinOp))
9897 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9898
9899 // Compute the end value for the additional bypass.
9900 EndValueFromAdditionalBypass =
9901 emitTransformedIndex(B&: BypassBuilder, Index: MainVectorTripCount,
9902 StartValue: II.getStartValue(), Step, InductionKind: II.getKind(), InductionBinOp: BinOp);
9903 EndValueFromAdditionalBypass->setName("ind.end");
9904 }
9905 return EndValueFromAdditionalBypass;
9906}
9907
9908bool LoopVectorizePass::processLoop(Loop *L) {
9909 assert((EnableVPlanNativePath || L->isInnermost()) &&
9910 "VPlan-native path is not enabled. Only process inner loops.");
9911
9912 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9913 << L->getHeader()->getParent()->getName() << "' from "
9914 << L->getLocStr() << "\n");
9915
9916 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9917
9918 LLVM_DEBUG(
9919 dbgs() << "LV: Loop hints:"
9920 << " force="
9921 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9922 ? "disabled"
9923 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9924 ? "enabled"
9925 : "?"))
9926 << " width=" << Hints.getWidth()
9927 << " interleave=" << Hints.getInterleave() << "\n");
9928
9929 // Function containing loop
9930 Function *F = L->getHeader()->getParent();
9931
9932 // Looking at the diagnostic output is the only way to determine if a loop
9933 // was vectorized (other than looking at the IR or machine code), so it
9934 // is important to generate an optimization remark for each loop. Most of
9935 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9936 // generated as OptimizationRemark and OptimizationRemarkMissed are
9937 // less verbose reporting vectorized loops and unvectorized loops that may
9938 // benefit from vectorization, respectively.
9939
9940 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9941 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9942 return false;
9943 }
9944
9945 PredicatedScalarEvolution PSE(*SE, *L);
9946
9947 // Check if it is legal to vectorize the loop.
9948 LoopVectorizationRequirements Requirements;
9949 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9950 &Requirements, &Hints, DB, AC, BFI, PSI);
9951 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9952 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9953 Hints.emitRemarkWithHints();
9954 return false;
9955 }
9956
9957 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
9958 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9959 "early exit is not enabled",
9960 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9961 return false;
9962 }
9963
9964 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9965 // here. They may require CFG and instruction level transformations before
9966 // even evaluating whether vectorization is profitable. Since we cannot modify
9967 // the incoming IR, we need to build VPlan upfront in the vectorization
9968 // pipeline.
9969 if (!L->isInnermost())
9970 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9971 ORE, BFI, PSI, Hints, Requirements);
9972
9973 assert(L->isInnermost() && "Inner loop expected.");
9974
9975 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9976 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9977
9978 // If an override option has been passed in for interleaved accesses, use it.
9979 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9980 UseInterleaved = EnableInterleavedMemAccesses;
9981
9982 // Analyze interleaved memory accesses.
9983 if (UseInterleaved)
9984 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9985
9986 if (LVL.hasUncountableEarlyExit()) {
9987 BasicBlock *LoopLatch = L->getLoopLatch();
9988 if (IAI.requiresScalarEpilogue() ||
9989 any_of(Range: LVL.getCountableExitingBlocks(),
9990 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9991 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9992 "requiring a scalar epilogue is unsupported",
9993 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9994 return false;
9995 }
9996 }
9997
9998 // Check the function attributes and profiles to find out if this function
9999 // should be optimized for size.
10000 ScalarEpilogueLowering SEL =
10001 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI);
10002
10003 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10004 // count by optimizing for size, to minimize overheads.
10005 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10006 if (ExpectedTC && ExpectedTC->isFixed() &&
10007 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
10008 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10009 << "This loop is worth vectorizing only if no scalar "
10010 << "iteration overheads are incurred.");
10011 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10012 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10013 else {
10014 LLVM_DEBUG(dbgs() << "\n");
10015 // Predicate tail-folded loops are efficient even when the loop
10016 // iteration count is low. However, setting the epilogue policy to
10017 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10018 // with runtime checks. It's more effective to let
10019 // `isOutsideLoopWorkProfitable` determine if vectorization is
10020 // beneficial for the loop.
10021 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10022 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10023 }
10024 }
10025
10026 // Check the function attributes to see if implicit floats or vectors are
10027 // allowed.
10028 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
10029 reportVectorizationFailure(
10030 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
10031 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
10032 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
10033 Hints.emitRemarkWithHints();
10034 return false;
10035 }
10036
10037 // Check if the target supports potentially unsafe FP vectorization.
10038 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10039 // for the target we're vectorizing for, to make sure none of the
10040 // additional fp-math flags can help.
10041 if (Hints.isPotentiallyUnsafe() &&
10042 TTI->isFPVectorizationPotentiallyUnsafe()) {
10043 reportVectorizationFailure(
10044 DebugMsg: "Potentially unsafe FP op prevents vectorization",
10045 OREMsg: "loop not vectorized due to unsafe FP support.",
10046 ORETag: "UnsafeFP", ORE, TheLoop: L);
10047 Hints.emitRemarkWithHints();
10048 return false;
10049 }
10050
10051 bool AllowOrderedReductions;
10052 // If the flag is set, use that instead and override the TTI behaviour.
10053 if (ForceOrderedReductions.getNumOccurrences() > 0)
10054 AllowOrderedReductions = ForceOrderedReductions;
10055 else
10056 AllowOrderedReductions = TTI->enableOrderedReductions();
10057 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
10058 ORE->emit(RemarkBuilder: [&]() {
10059 auto *ExactFPMathInst = Requirements.getExactFPInst();
10060 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10061 ExactFPMathInst->getDebugLoc(),
10062 ExactFPMathInst->getParent())
10063 << "loop not vectorized: cannot prove it is safe to reorder "
10064 "floating-point operations";
10065 });
10066 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10067 "reorder floating-point operations\n");
10068 Hints.emitRemarkWithHints();
10069 return false;
10070 }
10071
10072 // Use the cost model.
10073 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10074 F, &Hints, IAI, PSI, BFI);
10075 // Use the planner for vectorization.
10076 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10077 ORE);
10078
10079 // Get user vectorization factor and interleave count.
10080 ElementCount UserVF = Hints.getWidth();
10081 unsigned UserIC = Hints.getInterleave();
10082 if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
10083 UserIC = 1;
10084 reportVectorizationInfo(Msg: "Interleaving not supported for loops "
10085 "with uncountable early exits",
10086 ORETag: "InterleaveEarlyExitDisabled", ORE, TheLoop: L);
10087 }
10088
10089 // Plan how to best vectorize.
10090 LVP.plan(UserVF, UserIC);
10091 VectorizationFactor VF = LVP.computeBestVF();
10092 unsigned IC = 1;
10093
10094 if (ORE->allowExtraAnalysis(LV_NAME))
10095 LVP.emitInvalidCostRemarks(ORE);
10096
10097 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
10098 if (LVP.hasPlanWithVF(VF: VF.Width)) {
10099 // Select the interleave count.
10100 IC = CM.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
10101
10102 unsigned SelectedIC = std::max(a: IC, b: UserIC);
10103 // Optimistically generate runtime checks if they are needed. Drop them if
10104 // they turn out to not be profitable.
10105 if (VF.Width.isVector() || SelectedIC > 1)
10106 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC);
10107
10108 // Check if it is profitable to vectorize with runtime checks.
10109 bool ForceVectorization =
10110 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10111 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10112 CM, CM.CostKind);
10113 if (!ForceVectorization &&
10114 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
10115 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
10116 VScale: CM.getVScaleForTuning())) {
10117 ORE->emit(RemarkBuilder: [&]() {
10118 return OptimizationRemarkAnalysisAliasing(
10119 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10120 L->getHeader())
10121 << "loop not vectorized: cannot prove it is safe to reorder "
10122 "memory operations";
10123 });
10124 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10125 Hints.emitRemarkWithHints();
10126 return false;
10127 }
10128 }
10129
10130 // Identify the diagnostic messages that should be produced.
10131 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10132 bool VectorizeLoop = true, InterleaveLoop = true;
10133 if (VF.Width.isScalar()) {
10134 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10135 VecDiagMsg = {
10136 "VectorizationNotBeneficial",
10137 "the cost-model indicates that vectorization is not beneficial"};
10138 VectorizeLoop = false;
10139 }
10140
10141 if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
10142 // Tell the user interleaving was avoided up-front, despite being explicitly
10143 // requested.
10144 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10145 "interleaving should be avoided up front\n");
10146 IntDiagMsg = {"InterleavingAvoided",
10147 "Ignoring UserIC, because interleaving was avoided up front"};
10148 InterleaveLoop = false;
10149 } else if (IC == 1 && UserIC <= 1) {
10150 // Tell the user interleaving is not beneficial.
10151 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10152 IntDiagMsg = {
10153 "InterleavingNotBeneficial",
10154 "the cost-model indicates that interleaving is not beneficial"};
10155 InterleaveLoop = false;
10156 if (UserIC == 1) {
10157 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10158 IntDiagMsg.second +=
10159 " and is explicitly disabled or interleave count is set to 1";
10160 }
10161 } else if (IC > 1 && UserIC == 1) {
10162 // Tell the user interleaving is beneficial, but it explicitly disabled.
10163 LLVM_DEBUG(
10164 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10165 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10166 "the cost-model indicates that interleaving is beneficial "
10167 "but is explicitly disabled or interleave count is set to 1"};
10168 InterleaveLoop = false;
10169 }
10170
10171 // If there is a histogram in the loop, do not just interleave without
10172 // vectorizing. The order of operations will be incorrect without the
10173 // histogram intrinsics, which are only used for recipes with VF > 1.
10174 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10175 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10176 << "to histogram operations.\n");
10177 IntDiagMsg = {
10178 "HistogramPreventsScalarInterleaving",
10179 "Unable to interleave without vectorization due to constraints on "
10180 "the order of histogram operations"};
10181 InterleaveLoop = false;
10182 }
10183
10184 // Override IC if user provided an interleave count.
10185 IC = UserIC > 0 ? UserIC : IC;
10186
10187 // Emit diagnostic messages, if any.
10188 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10189 if (!VectorizeLoop && !InterleaveLoop) {
10190 // Do not vectorize or interleaving the loop.
10191 ORE->emit(RemarkBuilder: [&]() {
10192 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10193 L->getStartLoc(), L->getHeader())
10194 << VecDiagMsg.second;
10195 });
10196 ORE->emit(RemarkBuilder: [&]() {
10197 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10198 L->getStartLoc(), L->getHeader())
10199 << IntDiagMsg.second;
10200 });
10201 return false;
10202 }
10203
10204 if (!VectorizeLoop && InterleaveLoop) {
10205 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10206 ORE->emit(RemarkBuilder: [&]() {
10207 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10208 L->getStartLoc(), L->getHeader())
10209 << VecDiagMsg.second;
10210 });
10211 } else if (VectorizeLoop && !InterleaveLoop) {
10212 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10213 << ") in " << L->getLocStr() << '\n');
10214 ORE->emit(RemarkBuilder: [&]() {
10215 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10216 L->getStartLoc(), L->getHeader())
10217 << IntDiagMsg.second;
10218 });
10219 } else if (VectorizeLoop && InterleaveLoop) {
10220 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10221 << ") in " << L->getLocStr() << '\n');
10222 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10223 }
10224
10225 bool DisableRuntimeUnroll = false;
10226 MDNode *OrigLoopID = L->getLoopID();
10227 {
10228 using namespace ore;
10229 if (!VectorizeLoop) {
10230 assert(IC > 1 && "interleave count should not be 1 or 0");
10231 // If we decided that it is not legal to vectorize the loop, then
10232 // interleave it.
10233 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
10234 InnerLoopVectorizer Unroller(
10235 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(MinVal: 1),
10236 ElementCount::getFixed(MinVal: 1), IC, &CM, BFI, PSI, Checks, BestPlan);
10237
10238 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, VectorizingEpilogue: false);
10239
10240 ORE->emit(RemarkBuilder: [&]() {
10241 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10242 L->getHeader())
10243 << "interleaved loop (interleaved count: "
10244 << NV("InterleaveCount", IC) << ")";
10245 });
10246 } else {
10247 // If we decided that it is *legal* to vectorize the loop, then do it.
10248
10249 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
10250 // Consider vectorizing the epilogue too if it's profitable.
10251 VectorizationFactor EpilogueVF =
10252 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
10253 if (EpilogueVF.Width.isVector()) {
10254 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10255
10256 // The first pass vectorizes the main loop and creates a scalar epilogue
10257 // to be vectorized by executing the plan (potentially with a different
10258 // factor) again shortly afterwards.
10259 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
10260 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10261 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
10262 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10263 BestEpiPlan);
10264 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10265 EPI, &CM, BFI, PSI, Checks,
10266 *BestMainPlan);
10267 auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF,
10268 BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false);
10269 ++LoopsVectorized;
10270
10271 // Second pass vectorizes the epilogue and adjusts the control flow
10272 // edges from the first pass.
10273 EPI.MainLoopVF = EPI.EpilogueVF;
10274 EPI.MainLoopUF = EPI.EpilogueUF;
10275 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10276 ORE, EPI, &CM, BFI, PSI,
10277 Checks, BestEpiPlan);
10278 EpilogILV.setTripCount(MainILV.getTripCount());
10279 preparePlanForEpilogueVectorLoop(Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI);
10280
10281 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV,
10282 DT, VectorizingEpilogue: true);
10283
10284 // Fix induction resume values from the additional bypass block.
10285 BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
10286 IRBuilder<> BypassBuilder(BypassBlock,
10287 BypassBlock->getFirstInsertionPt());
10288 BasicBlock *PH = L->getLoopPreheader();
10289 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
10290 auto *Inc = cast<PHINode>(Val: IVPhi->getIncomingValueForBlock(BB: PH));
10291 Value *V = createInductionAdditionalBypassValues(
10292 OrigPhi: IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount: EPI.VectorTripCount,
10293 OldInduction: LVL.getPrimaryInduction());
10294 // TODO: Directly add as extra operand to the VPResumePHI recipe.
10295 Inc->setIncomingValueForBlock(BB: BypassBlock, V);
10296 }
10297 ++LoopsEpilogueVectorized;
10298
10299 if (!Checks.hasChecks())
10300 DisableRuntimeUnroll = true;
10301 } else {
10302 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10303 VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
10304 Checks, BestPlan);
10305 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
10306 ++LoopsVectorized;
10307
10308 // Add metadata to disable runtime unrolling a scalar loop when there
10309 // are no runtime checks about strides and memory. A scalar loop that is
10310 // rarely used is not worth unrolling.
10311 if (!Checks.hasChecks())
10312 DisableRuntimeUnroll = true;
10313 }
10314 // Report the vectorization decision.
10315 reportVectorization(ORE, TheLoop: L, VF, IC);
10316 }
10317
10318 if (ORE->allowExtraAnalysis(LV_NAME))
10319 checkMixedPrecision(L, ORE);
10320 }
10321
10322 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10323 "DT not preserved correctly");
10324
10325 std::optional<MDNode *> RemainderLoopID =
10326 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
10327 LLVMLoopVectorizeFollowupEpilogue});
10328 if (RemainderLoopID) {
10329 L->setLoopID(*RemainderLoopID);
10330 } else {
10331 if (DisableRuntimeUnroll)
10332 addRuntimeUnrollDisableMetaData(L);
10333
10334 // Mark the loop as already vectorized to avoid vectorizing again.
10335 Hints.setAlreadyVectorized();
10336 }
10337
10338 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10339 return true;
10340}
10341
10342LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10343
10344 // Don't attempt if
10345 // 1. the target claims to have no vector registers, and
10346 // 2. interleaving won't help ILP.
10347 //
10348 // The second condition is necessary because, even if the target has no
10349 // vector registers, loop vectorization may still enable scalar
10350 // interleaving.
10351 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
10352 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
10353 return LoopVectorizeResult(false, false);
10354
10355 bool Changed = false, CFGChanged = false;
10356
10357 // The vectorizer requires loops to be in simplified form.
10358 // Since simplification may add new inner loops, it has to run before the
10359 // legality and profitability checks. This means running the loop vectorizer
10360 // will simplify all loops, regardless of whether anything end up being
10361 // vectorized.
10362 for (const auto &L : *LI)
10363 Changed |= CFGChanged |=
10364 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
10365
10366 // Build up a worklist of inner-loops to vectorize. This is necessary as
10367 // the act of vectorizing or partially unrolling a loop creates new loops
10368 // and can invalidate iterators across the loops.
10369 SmallVector<Loop *, 8> Worklist;
10370
10371 for (Loop *L : *LI)
10372 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
10373
10374 LoopsAnalyzed += Worklist.size();
10375
10376 // Now walk the identified inner loops.
10377 while (!Worklist.empty()) {
10378 Loop *L = Worklist.pop_back_val();
10379
10380 // For the inner loops we actually process, form LCSSA to simplify the
10381 // transform.
10382 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
10383
10384 Changed |= CFGChanged |= processLoop(L);
10385
10386 if (Changed) {
10387 LAIs->clear();
10388
10389#ifndef NDEBUG
10390 if (VerifySCEV)
10391 SE->verify();
10392#endif
10393 }
10394 }
10395
10396 // Process each loop nest in the function.
10397 return LoopVectorizeResult(Changed, CFGChanged);
10398}
10399
10400PreservedAnalyses LoopVectorizePass::run(Function &F,
10401 FunctionAnalysisManager &AM) {
10402 LI = &AM.getResult<LoopAnalysis>(IR&: F);
10403 // There are no loops in the function. Return before computing other
10404 // expensive analyses.
10405 if (LI->empty())
10406 return PreservedAnalyses::all();
10407 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
10408 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
10409 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
10410 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
10411 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
10412 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
10413 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
10414 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
10415
10416 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
10417 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
10418 BFI = nullptr;
10419 if (PSI && PSI->hasProfileSummary())
10420 BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F);
10421 LoopVectorizeResult Result = runImpl(F);
10422 if (!Result.MadeAnyChange)
10423 return PreservedAnalyses::all();
10424 PreservedAnalyses PA;
10425
10426 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
10427 for (auto &BB : F)
10428 RemoveRedundantDbgInstrs(BB: &BB);
10429 }
10430
10431 PA.preserve<LoopAnalysis>();
10432 PA.preserve<DominatorTreeAnalysis>();
10433 PA.preserve<ScalarEvolutionAnalysis>();
10434 PA.preserve<LoopAccessAnalysis>();
10435
10436 if (Result.MadeCFGChange) {
10437 // Making CFG changes likely means a loop got vectorized. Indicate that
10438 // extra simplification passes should be run.
10439 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10440 // be run if runtime checks have been added.
10441 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
10442 PA.preserve<ShouldRunExtraVectorPasses>();
10443 } else {
10444 PA.preserveSet<CFGAnalyses>();
10445 }
10446 return PA;
10447}
10448
10449void LoopVectorizePass::printPipeline(
10450 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10451 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10452 OS, MapClassName2PassName);
10453
10454 OS << '<';
10455 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10456 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10457 OS << '>';
10458}
10459

// End of llvm/lib/Transforms/Vectorize/LoopVectorize.cpp