1//===- LoopVectorize.cpp - A Loop Vectorizer ------------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8//
9// This is the LLVM loop vectorizer. This pass modifies 'vectorizable' loops
10// and generates target-independent LLVM-IR.
11// The vectorizer uses the TargetTransformInfo analysis to estimate the costs
12// of instructions in order to estimate the profitability of vectorization.
13//
14// The loop vectorizer combines consecutive loop iterations into a single
15// 'wide' iteration. After this transformation the index is incremented
16// by the SIMD vector width, and not by one.
17//
// This pass has four parts:
19// 1. The main loop pass that drives the different parts.
20// 2. LoopVectorizationLegality - A unit that checks for the legality
21// of the vectorization.
22// 3. InnerLoopVectorizer - A unit that performs the actual
23// widening of instructions.
24// 4. LoopVectorizationCostModel - A unit that checks for the profitability
25// of vectorization. It decides on the optimal vector width, which
26// can be one, if vectorization is not profitable.
27//
28// There is a development effort going on to migrate loop vectorizer to the
29// VPlan infrastructure and to introduce outer loop vectorization support (see
30// docs/VectorizationPlan.rst and
31// http://lists.llvm.org/pipermail/llvm-dev/2017-December/119523.html). For this
32// purpose, we temporarily introduced the VPlan-native vectorization path: an
33// alternative vectorization path that is natively implemented on top of the
34// VPlan infrastructure. See EnableVPlanNativePath for enabling.
35//
36//===----------------------------------------------------------------------===//
37//
38// The reduction-variable vectorization is based on the paper:
39// D. Nuzman and R. Henderson. Multi-platform Auto-vectorization.
40//
41// Variable uniformity checks are inspired by:
42// Karrenberg, R. and Hack, S. Whole Function Vectorization.
43//
44// The interleaved access vectorization is based on the paper:
45// Dorit Nuzman, Ira Rosen and Ayal Zaks. Auto-Vectorization of Interleaved
46// Data for SIMD
47//
48// Other ideas/concepts are from:
49// A. Zaks and D. Nuzman. Autovectorization in GCC-two years later.
50//
51// S. Maleki, Y. Gao, M. Garzaran, T. Wong and D. Padua. An Evaluation of
52// Vectorizing Compilers.
53//
54//===----------------------------------------------------------------------===//
55
56#include "llvm/Transforms/Vectorize/LoopVectorize.h"
57#include "LoopVectorizationPlanner.h"
58#include "VPRecipeBuilder.h"
59#include "VPlan.h"
60#include "VPlanAnalysis.h"
61#include "VPlanCFG.h"
62#include "VPlanHelpers.h"
63#include "VPlanPatternMatch.h"
64#include "VPlanTransforms.h"
65#include "VPlanUtils.h"
66#include "VPlanVerifier.h"
67#include "llvm/ADT/APInt.h"
68#include "llvm/ADT/ArrayRef.h"
69#include "llvm/ADT/DenseMap.h"
70#include "llvm/ADT/DenseMapInfo.h"
71#include "llvm/ADT/Hashing.h"
72#include "llvm/ADT/MapVector.h"
73#include "llvm/ADT/STLExtras.h"
74#include "llvm/ADT/SmallPtrSet.h"
75#include "llvm/ADT/SmallVector.h"
76#include "llvm/ADT/Statistic.h"
77#include "llvm/ADT/StringRef.h"
78#include "llvm/ADT/Twine.h"
79#include "llvm/ADT/TypeSwitch.h"
80#include "llvm/ADT/iterator_range.h"
81#include "llvm/Analysis/AssumptionCache.h"
82#include "llvm/Analysis/BasicAliasAnalysis.h"
83#include "llvm/Analysis/BlockFrequencyInfo.h"
84#include "llvm/Analysis/CFG.h"
85#include "llvm/Analysis/CodeMetrics.h"
86#include "llvm/Analysis/DemandedBits.h"
87#include "llvm/Analysis/GlobalsModRef.h"
88#include "llvm/Analysis/LoopAccessAnalysis.h"
89#include "llvm/Analysis/LoopAnalysisManager.h"
90#include "llvm/Analysis/LoopInfo.h"
91#include "llvm/Analysis/LoopIterator.h"
92#include "llvm/Analysis/OptimizationRemarkEmitter.h"
93#include "llvm/Analysis/ProfileSummaryInfo.h"
94#include "llvm/Analysis/ScalarEvolution.h"
95#include "llvm/Analysis/ScalarEvolutionExpressions.h"
96#include "llvm/Analysis/TargetLibraryInfo.h"
97#include "llvm/Analysis/TargetTransformInfo.h"
98#include "llvm/Analysis/ValueTracking.h"
99#include "llvm/Analysis/VectorUtils.h"
100#include "llvm/IR/Attributes.h"
101#include "llvm/IR/BasicBlock.h"
102#include "llvm/IR/CFG.h"
103#include "llvm/IR/Constant.h"
104#include "llvm/IR/Constants.h"
105#include "llvm/IR/DataLayout.h"
106#include "llvm/IR/DebugInfo.h"
107#include "llvm/IR/DebugLoc.h"
108#include "llvm/IR/DerivedTypes.h"
109#include "llvm/IR/DiagnosticInfo.h"
110#include "llvm/IR/Dominators.h"
111#include "llvm/IR/Function.h"
112#include "llvm/IR/IRBuilder.h"
113#include "llvm/IR/InstrTypes.h"
114#include "llvm/IR/Instruction.h"
115#include "llvm/IR/Instructions.h"
116#include "llvm/IR/IntrinsicInst.h"
117#include "llvm/IR/Intrinsics.h"
118#include "llvm/IR/MDBuilder.h"
119#include "llvm/IR/Metadata.h"
120#include "llvm/IR/Module.h"
121#include "llvm/IR/Operator.h"
122#include "llvm/IR/PatternMatch.h"
123#include "llvm/IR/ProfDataUtils.h"
124#include "llvm/IR/Type.h"
125#include "llvm/IR/Use.h"
126#include "llvm/IR/User.h"
127#include "llvm/IR/Value.h"
128#include "llvm/IR/Verifier.h"
129#include "llvm/Support/Casting.h"
130#include "llvm/Support/CommandLine.h"
131#include "llvm/Support/Debug.h"
132#include "llvm/Support/ErrorHandling.h"
133#include "llvm/Support/InstructionCost.h"
134#include "llvm/Support/MathExtras.h"
135#include "llvm/Support/NativeFormatting.h"
136#include "llvm/Support/raw_ostream.h"
137#include "llvm/Transforms/Utils/BasicBlockUtils.h"
138#include "llvm/Transforms/Utils/InjectTLIMappings.h"
139#include "llvm/Transforms/Utils/Local.h"
140#include "llvm/Transforms/Utils/LoopSimplify.h"
141#include "llvm/Transforms/Utils/LoopUtils.h"
142#include "llvm/Transforms/Utils/LoopVersioning.h"
143#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
144#include "llvm/Transforms/Utils/SizeOpts.h"
145#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
146#include <algorithm>
147#include <cassert>
148#include <cstdint>
149#include <functional>
150#include <iterator>
151#include <limits>
152#include <memory>
153#include <string>
154#include <tuple>
155#include <utility>
156
157using namespace llvm;
158
159#define LV_NAME "loop-vectorize"
160#define DEBUG_TYPE LV_NAME
161
162#ifndef NDEBUG
163const char VerboseDebug[] = DEBUG_TYPE "-verbose";
164#endif
165
166/// @{
167/// Metadata attribute names
168const char LLVMLoopVectorizeFollowupAll[] = "llvm.loop.vectorize.followup_all";
169const char LLVMLoopVectorizeFollowupVectorized[] =
170 "llvm.loop.vectorize.followup_vectorized";
171const char LLVMLoopVectorizeFollowupEpilogue[] =
172 "llvm.loop.vectorize.followup_epilogue";
173/// @}
174
175STATISTIC(LoopsVectorized, "Number of loops vectorized");
176STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
177STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
178STATISTIC(LoopsEarlyExitVectorized, "Number of early exit loops vectorized");
179
180static cl::opt<bool> EnableEpilogueVectorization(
181 "enable-epilogue-vectorization", cl::init(Val: true), cl::Hidden,
182 cl::desc("Enable vectorization of epilogue loops."));
183
184static cl::opt<unsigned> EpilogueVectorizationForceVF(
185 "epilogue-vectorization-force-VF", cl::init(Val: 1), cl::Hidden,
186 cl::desc("When epilogue vectorization is enabled, and a value greater than "
187 "1 is specified, forces the given VF for all applicable epilogue "
188 "loops."));
189
190static cl::opt<unsigned> EpilogueVectorizationMinVF(
191 "epilogue-vectorization-minimum-VF", cl::Hidden,
192 cl::desc("Only loops with vectorization factor equal to or larger than "
193 "the specified value are considered for epilogue vectorization."));
194
195/// Loops with a known constant trip count below this number are vectorized only
196/// if no scalar iteration overheads are incurred.
197static cl::opt<unsigned> TinyTripCountVectorThreshold(
198 "vectorizer-min-trip-count", cl::init(Val: 16), cl::Hidden,
199 cl::desc("Loops with a constant trip count that is smaller than this "
200 "value are vectorized only if no scalar iteration overheads "
201 "are incurred."));
202
203static cl::opt<unsigned> VectorizeMemoryCheckThreshold(
204 "vectorize-memory-check-threshold", cl::init(Val: 128), cl::Hidden,
205 cl::desc("The maximum allowed number of runtime memory checks"));
206
207// Option prefer-predicate-over-epilogue indicates that an epilogue is undesired,
208// that predication is preferred, and this lists all options. I.e., the
209// vectorizer will try to fold the tail-loop (epilogue) into the vector body
210// and predicate the instructions accordingly. If tail-folding fails, there are
211// different fallback strategies depending on these values:
212namespace PreferPredicateTy {
213 enum Option {
214 ScalarEpilogue = 0,
215 PredicateElseScalarEpilogue,
216 PredicateOrDontVectorize
217 };
218} // namespace PreferPredicateTy
219
220static cl::opt<PreferPredicateTy::Option> PreferPredicateOverEpilogue(
221 "prefer-predicate-over-epilogue",
222 cl::init(Val: PreferPredicateTy::ScalarEpilogue),
223 cl::Hidden,
224 cl::desc("Tail-folding and predication preferences over creating a scalar "
225 "epilogue loop."),
226 cl::values(clEnumValN(PreferPredicateTy::ScalarEpilogue,
227 "scalar-epilogue",
228 "Don't tail-predicate loops, create scalar epilogue"),
229 clEnumValN(PreferPredicateTy::PredicateElseScalarEpilogue,
230 "predicate-else-scalar-epilogue",
231 "prefer tail-folding, create scalar epilogue if tail "
232 "folding fails."),
233 clEnumValN(PreferPredicateTy::PredicateOrDontVectorize,
234 "predicate-dont-vectorize",
235 "prefers tail-folding, don't attempt vectorization if "
236 "tail-folding fails.")));
237
238static cl::opt<TailFoldingStyle> ForceTailFoldingStyle(
239 "force-tail-folding-style", cl::desc("Force the tail folding style"),
240 cl::init(Val: TailFoldingStyle::None),
241 cl::values(
242 clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"),
243 clEnumValN(
244 TailFoldingStyle::Data, "data",
245 "Create lane mask for data only, using active.lane.mask intrinsic"),
246 clEnumValN(TailFoldingStyle::DataWithoutLaneMask,
247 "data-without-lane-mask",
248 "Create lane mask with compare/stepvector"),
249 clEnumValN(TailFoldingStyle::DataAndControlFlow, "data-and-control",
250 "Create lane mask using active.lane.mask intrinsic, and use "
251 "it for both data and control flow"),
252 clEnumValN(TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck,
253 "data-and-control-without-rt-check",
254 "Similar to data-and-control, but remove the runtime check"),
255 clEnumValN(TailFoldingStyle::DataWithEVL, "data-with-evl",
256 "Use predicated EVL instructions for tail folding. If EVL "
257 "is unsupported, fallback to data-without-lane-mask.")));
258
259static cl::opt<bool> MaximizeBandwidth(
260 "vectorizer-maximize-bandwidth", cl::init(Val: false), cl::Hidden,
261 cl::desc("Maximize bandwidth when selecting vectorization factor which "
262 "will be determined by the smallest type in loop."));
263
264static cl::opt<bool> EnableInterleavedMemAccesses(
265 "enable-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
266 cl::desc("Enable vectorization on interleaved memory accesses in a loop"));
267
268/// An interleave-group may need masking if it resides in a block that needs
269/// predication, or in order to mask away gaps.
270static cl::opt<bool> EnableMaskedInterleavedMemAccesses(
271 "enable-masked-interleaved-mem-accesses", cl::init(Val: false), cl::Hidden,
272 cl::desc("Enable vectorization on masked interleaved memory accesses in a loop"));
273
274static cl::opt<unsigned> ForceTargetNumScalarRegs(
275 "force-target-num-scalar-regs", cl::init(Val: 0), cl::Hidden,
276 cl::desc("A flag that overrides the target's number of scalar registers."));
277
278static cl::opt<unsigned> ForceTargetNumVectorRegs(
279 "force-target-num-vector-regs", cl::init(Val: 0), cl::Hidden,
280 cl::desc("A flag that overrides the target's number of vector registers."));
281
282static cl::opt<unsigned> ForceTargetMaxScalarInterleaveFactor(
283 "force-target-max-scalar-interleave", cl::init(Val: 0), cl::Hidden,
284 cl::desc("A flag that overrides the target's max interleave factor for "
285 "scalar loops."));
286
287static cl::opt<unsigned> ForceTargetMaxVectorInterleaveFactor(
288 "force-target-max-vector-interleave", cl::init(Val: 0), cl::Hidden,
289 cl::desc("A flag that overrides the target's max interleave factor for "
290 "vectorized loops."));
291
292cl::opt<unsigned> llvm::ForceTargetInstructionCost(
293 "force-target-instruction-cost", cl::init(Val: 0), cl::Hidden,
294 cl::desc("A flag that overrides the target's expected cost for "
295 "an instruction to a single constant value. Mostly "
296 "useful for getting consistent testing."));
297
298static cl::opt<bool> ForceTargetSupportsScalableVectors(
299 "force-target-supports-scalable-vectors", cl::init(Val: false), cl::Hidden,
300 cl::desc(
301 "Pretend that scalable vectors are supported, even if the target does "
302 "not support them. This flag should only be used for testing."));
303
304static cl::opt<unsigned> SmallLoopCost(
305 "small-loop-cost", cl::init(Val: 20), cl::Hidden,
306 cl::desc(
307 "The cost of a loop that is considered 'small' by the interleaver."));
308
309static cl::opt<bool> LoopVectorizeWithBlockFrequency(
310 "loop-vectorize-with-block-frequency", cl::init(Val: true), cl::Hidden,
311 cl::desc("Enable the use of the block frequency analysis to access PGO "
312 "heuristics minimizing code growth in cold regions and being more "
313 "aggressive in hot regions."));
314
315// Runtime interleave loops for load/store throughput.
316static cl::opt<bool> EnableLoadStoreRuntimeInterleave(
317 "enable-loadstore-runtime-interleave", cl::init(Val: true), cl::Hidden,
318 cl::desc(
319 "Enable runtime interleaving until load/store ports are saturated"));
320
321/// The number of stores in a loop that are allowed to need predication.
322static cl::opt<unsigned> NumberOfStoresToPredicate(
323 "vectorize-num-stores-pred", cl::init(Val: 1), cl::Hidden,
324 cl::desc("Max number of stores to be predicated behind an if."));
325
326static cl::opt<bool> EnableIndVarRegisterHeur(
327 "enable-ind-var-reg-heur", cl::init(Val: true), cl::Hidden,
328 cl::desc("Count the induction variable only once when interleaving"));
329
330static cl::opt<bool> EnableCondStoresVectorization(
331 "enable-cond-stores-vec", cl::init(Val: true), cl::Hidden,
332 cl::desc("Enable if predication of stores during vectorization."));
333
334static cl::opt<unsigned> MaxNestedScalarReductionIC(
335 "max-nested-scalar-reduction-interleave", cl::init(Val: 2), cl::Hidden,
336 cl::desc("The maximum interleave count to use when interleaving a scalar "
337 "reduction in a nested loop."));
338
339static cl::opt<bool>
340 PreferInLoopReductions("prefer-inloop-reductions", cl::init(Val: false),
341 cl::Hidden,
342 cl::desc("Prefer in-loop vector reductions, "
343 "overriding the targets preference."));
344
345static cl::opt<bool> ForceOrderedReductions(
346 "force-ordered-reductions", cl::init(Val: false), cl::Hidden,
347 cl::desc("Enable the vectorisation of loops with in-order (strict) "
348 "FP reductions"));
349
350static cl::opt<bool> PreferPredicatedReductionSelect(
351 "prefer-predicated-reduction-select", cl::init(Val: false), cl::Hidden,
352 cl::desc(
353 "Prefer predicating a reduction operation over an after loop select."));
354
355cl::opt<bool> llvm::EnableVPlanNativePath(
356 "enable-vplan-native-path", cl::Hidden,
357 cl::desc("Enable VPlan-native vectorization path with "
358 "support for outer loop vectorization."));
359
360cl::opt<bool>
361 llvm::VerifyEachVPlan("vplan-verify-each",
362#ifdef EXPENSIVE_CHECKS
363 cl::init(true),
364#else
365 cl::init(Val: false),
366#endif
367 cl::Hidden,
368 cl::desc("Verfiy VPlans after VPlan transforms."));
369
370// This flag enables the stress testing of the VPlan H-CFG construction in the
371// VPlan-native vectorization path. It must be used in conjuction with
372// -enable-vplan-native-path. -vplan-verify-hcfg can also be used to enable the
373// verification of the H-CFGs built.
374static cl::opt<bool> VPlanBuildStressTest(
375 "vplan-build-stress-test", cl::init(Val: false), cl::Hidden,
376 cl::desc(
377 "Build VPlan for every supported loop nest in the function and bail "
378 "out right after the build (stress test the VPlan H-CFG construction "
379 "in the VPlan-native vectorization path)."));
380
381cl::opt<bool> llvm::EnableLoopInterleaving(
382 "interleave-loops", cl::init(Val: true), cl::Hidden,
383 cl::desc("Enable loop interleaving in Loop vectorization passes"));
384cl::opt<bool> llvm::EnableLoopVectorization(
385 "vectorize-loops", cl::init(Val: true), cl::Hidden,
386 cl::desc("Run the Loop vectorization passes"));
387
388static cl::opt<cl::boolOrDefault> ForceSafeDivisor(
389 "force-widen-divrem-via-safe-divisor", cl::Hidden,
390 cl::desc(
391 "Override cost based safe divisor widening for div/rem instructions"));
392
393static cl::opt<bool> UseWiderVFIfCallVariantsPresent(
394 "vectorizer-maximize-bandwidth-for-vector-calls", cl::init(Val: true),
395 cl::Hidden,
396 cl::desc("Try wider VFs if they enable the use of vector variants"));
397
398static cl::opt<bool> EnableEarlyExitVectorization(
399 "enable-early-exit-vectorization", cl::init(Val: true), cl::Hidden,
400 cl::desc(
401 "Enable vectorization of early exit loops with uncountable exits."));
402
403// Likelyhood of bypassing the vectorized loop because there are zero trips left
404// after prolog. See `emitIterationCountCheck`.
405static constexpr uint32_t MinItersBypassWeights[] = {1, 127};
406
407/// A helper function that returns true if the given type is irregular. The
408/// type is irregular if its allocated size doesn't equal the store size of an
409/// element of the corresponding vector type.
410static bool hasIrregularType(Type *Ty, const DataLayout &DL) {
411 // Determine if an array of N elements of type Ty is "bitcast compatible"
412 // with a <N x Ty> vector.
413 // This is only true if there is no padding between the array elements.
414 return DL.getTypeAllocSizeInBits(Ty) != DL.getTypeSizeInBits(Ty);
415}
416
417/// A version of ScalarEvolution::getSmallConstantTripCount that returns an
418/// ElementCount to include loops whose trip count is a function of vscale.
419static ElementCount getSmallConstantTripCount(ScalarEvolution *SE,
420 const Loop *L) {
421 return ElementCount::getFixed(MinVal: SE->getSmallConstantTripCount(L));
422}
423
424/// Returns "best known" trip count, which is either a valid positive trip count
425/// or std::nullopt when an estimate cannot be made (including when the trip
426/// count would overflow), for the specified loop \p L as defined by the
427/// following procedure:
428/// 1) Returns exact trip count if it is known.
429/// 2) Returns expected trip count according to profile data if any.
430/// 3) Returns upper bound estimate if known, and if \p CanUseConstantMax.
431/// 4) Returns std::nullopt if all of the above failed.
432static std::optional<ElementCount>
433getSmallBestKnownTC(PredicatedScalarEvolution &PSE, Loop *L,
434 bool CanUseConstantMax = true) {
435 // Check if exact trip count is known.
436 if (auto ExpectedTC = getSmallConstantTripCount(SE: PSE.getSE(), L))
437 return ExpectedTC;
438
439 // Check if there is an expected trip count available from profile data.
440 if (LoopVectorizeWithBlockFrequency)
441 if (auto EstimatedTC = getLoopEstimatedTripCount(L))
442 return ElementCount::getFixed(MinVal: *EstimatedTC);
443
444 if (!CanUseConstantMax)
445 return std::nullopt;
446
447 // Check if upper bound estimate is known.
448 if (unsigned ExpectedTC = PSE.getSmallConstantMaxTripCount())
449 return ElementCount::getFixed(MinVal: ExpectedTC);
450
451 return std::nullopt;
452}
453
454namespace {
455// Forward declare GeneratedRTChecks.
456class GeneratedRTChecks;
457
458using SCEV2ValueTy = DenseMap<const SCEV *, Value *>;
459} // namespace
460
461namespace llvm {
462
463AnalysisKey ShouldRunExtraVectorPasses::Key;
464
465/// InnerLoopVectorizer vectorizes loops which contain only one basic
466/// block to a specified vectorization factor (VF).
467/// This class performs the widening of scalars into vectors, or multiple
468/// scalars. This class also implements the following features:
469/// * It inserts an epilogue loop for handling loops that don't have iteration
470/// counts that are known to be a multiple of the vectorization factor.
471/// * It handles the code generation for reduction variables.
472/// * Scalarization (implementation using scalars) of un-vectorizable
473/// instructions.
474/// InnerLoopVectorizer does not perform any vectorization-legality
475/// checks, and relies on the caller to check for the different legality
476/// aspects. The InnerLoopVectorizer relies on the
477/// LoopVectorizationLegality class to provide information about the induction
478/// and reduction variables that were found to a given vectorization factor.
479class InnerLoopVectorizer {
480public:
481 InnerLoopVectorizer(Loop *OrigLoop, PredicatedScalarEvolution &PSE,
482 LoopInfo *LI, DominatorTree *DT,
483 const TargetLibraryInfo *TLI,
484 const TargetTransformInfo *TTI, AssumptionCache *AC,
485 OptimizationRemarkEmitter *ORE, ElementCount VecWidth,
486 ElementCount MinProfitableTripCount,
487 unsigned UnrollFactor, LoopVectorizationCostModel *CM,
488 BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI,
489 GeneratedRTChecks &RTChecks, VPlan &Plan)
490 : OrigLoop(OrigLoop), PSE(PSE), LI(LI), DT(DT), TLI(TLI), TTI(TTI),
491 AC(AC), ORE(ORE), VF(VecWidth),
492 MinProfitableTripCount(MinProfitableTripCount), UF(UnrollFactor),
493 Builder(PSE.getSE()->getContext()), Cost(CM), BFI(BFI), PSI(PSI),
494 RTChecks(RTChecks), Plan(Plan),
495 VectorPHVPB(Plan.getVectorLoopRegion()->getSinglePredecessor()) {}
496
497 virtual ~InnerLoopVectorizer() = default;
498
499 /// Create a new empty loop that will contain vectorized instructions later
500 /// on, while the old loop will be used as the scalar remainder. Control flow
501 /// is generated around the vectorized (and scalar epilogue) loops consisting
502 /// of various checks and bypasses. Return the pre-header block of the new
503 /// loop. In the case of epilogue vectorization, this function is overriden to
504 /// handle the more complex control flow around the loops.
505 virtual BasicBlock *createVectorizedLoopSkeleton();
506
507 /// Fix the vectorized code, taking care of header phi's, and more.
508 void fixVectorizedLoop(VPTransformState &State);
509
510 /// Fix the non-induction PHIs in \p Plan.
511 void fixNonInductionPHIs(VPTransformState &State);
512
513 /// Returns the original loop trip count.
514 Value *getTripCount() const { return TripCount; }
515
516 /// Used to set the trip count after ILV's construction and after the
517 /// preheader block has been executed. Note that this always holds the trip
518 /// count of the original loop for both main loop and epilogue vectorization.
519 void setTripCount(Value *TC) { TripCount = TC; }
520
521 /// Return the additional bypass block which targets the scalar loop by
522 /// skipping the epilogue loop after completing the main loop.
523 BasicBlock *getAdditionalBypassBlock() const {
524 assert(AdditionalBypassBlock &&
525 "Trying to access AdditionalBypassBlock but it has not been set");
526 return AdditionalBypassBlock;
527 }
528
529protected:
530 friend class LoopVectorizationPlanner;
531
532 /// Returns (and creates if needed) the trip count of the widened loop.
533 Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock);
534
535 // Create a check to see if the vector loop should be executed
536 Value *createIterationCountCheck(ElementCount VF, unsigned UF) const;
537
538 /// Emit a bypass check to see if the vector trip count is zero, including if
539 /// it overflows.
540 void emitIterationCountCheck(BasicBlock *Bypass);
541
542 /// Emit basic blocks (prefixed with \p Prefix) for the iteration check,
543 /// vector loop preheader, middle block and scalar preheader.
544 void createVectorLoopSkeleton(StringRef Prefix);
545
546 /// Allow subclasses to override and print debug traces before/after vplan
547 /// execution, when trace information is requested.
548 virtual void printDebugTracesAtStart() {}
549 virtual void printDebugTracesAtEnd() {}
550
551 /// Introduces a new VPIRBasicBlock for \p CheckIRBB to Plan between the
552 /// vector preheader and its predecessor, also connecting the new block to the
553 /// scalar preheader.
554 void introduceCheckBlockInVPlan(BasicBlock *CheckIRBB);
555
556 /// The original loop.
557 Loop *OrigLoop;
558
559 /// A wrapper around ScalarEvolution used to add runtime SCEV checks. Applies
560 /// dynamic knowledge to simplify SCEV expressions and converts them to a
561 /// more usable form.
562 PredicatedScalarEvolution &PSE;
563
564 /// Loop Info.
565 LoopInfo *LI;
566
567 /// Dominator Tree.
568 DominatorTree *DT;
569
570 /// Target Library Info.
571 const TargetLibraryInfo *TLI;
572
573 /// Target Transform Info.
574 const TargetTransformInfo *TTI;
575
576 /// Assumption Cache.
577 AssumptionCache *AC;
578
579 /// Interface to emit optimization remarks.
580 OptimizationRemarkEmitter *ORE;
581
582 /// The vectorization SIMD factor to use. Each vector will have this many
583 /// vector elements.
584 ElementCount VF;
585
586 ElementCount MinProfitableTripCount;
587
588 /// The vectorization unroll factor to use. Each scalar is vectorized to this
589 /// many different vector instructions.
590 unsigned UF;
591
592 /// The builder that we use
593 IRBuilder<> Builder;
594
595 // --- Vectorization state ---
596
597 /// The vector-loop preheader.
598 BasicBlock *LoopVectorPreHeader = nullptr;
599
600 /// The scalar-loop preheader.
601 BasicBlock *LoopScalarPreHeader = nullptr;
602
603 /// Middle Block between the vector and the scalar.
604 BasicBlock *LoopMiddleBlock = nullptr;
605
606 /// Trip count of the original loop.
607 Value *TripCount = nullptr;
608
609 /// Trip count of the widened loop (TripCount - TripCount % (VF*UF))
610 Value *VectorTripCount = nullptr;
611
612 /// The profitablity analysis.
613 LoopVectorizationCostModel *Cost;
614
615 /// BFI and PSI are used to check for profile guided size optimizations.
616 BlockFrequencyInfo *BFI;
617 ProfileSummaryInfo *PSI;
618
619 /// Structure to hold information about generated runtime checks, responsible
620 /// for cleaning the checks, if vectorization turns out unprofitable.
621 GeneratedRTChecks &RTChecks;
622
623 /// The additional bypass block which conditionally skips over the epilogue
624 /// loop after executing the main loop. Needed to resume inductions and
625 /// reductions during epilogue vectorization.
626 BasicBlock *AdditionalBypassBlock = nullptr;
627
628 VPlan &Plan;
629
630 /// The vector preheader block of \p Plan, used as target for check blocks
631 /// introduced during skeleton creation.
632 VPBlockBase *VectorPHVPB;
633};
634
635/// Encapsulate information regarding vectorization of a loop and its epilogue.
636/// This information is meant to be updated and used across two stages of
637/// epilogue vectorization.
638struct EpilogueLoopVectorizationInfo {
639 ElementCount MainLoopVF = ElementCount::getFixed(MinVal: 0);
640 unsigned MainLoopUF = 0;
641 ElementCount EpilogueVF = ElementCount::getFixed(MinVal: 0);
642 unsigned EpilogueUF = 0;
643 BasicBlock *MainLoopIterationCountCheck = nullptr;
644 BasicBlock *EpilogueIterationCountCheck = nullptr;
645 Value *TripCount = nullptr;
646 Value *VectorTripCount = nullptr;
647 VPlan &EpiloguePlan;
648
649 EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF,
650 ElementCount EVF, unsigned EUF,
651 VPlan &EpiloguePlan)
652 : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF),
653 EpiloguePlan(EpiloguePlan) {
654 assert(EUF == 1 &&
655 "A high UF for the epilogue loop is likely not beneficial.");
656 }
657};
658
659/// An extension of the inner loop vectorizer that creates a skeleton for a
660/// vectorized loop that has its epilogue (residual) also vectorized.
661/// The idea is to run the vplan on a given loop twice, firstly to setup the
662/// skeleton and vectorize the main loop, and secondly to complete the skeleton
663/// from the first step and vectorize the epilogue. This is achieved by
664/// deriving two concrete strategy classes from this base class and invoking
665/// them in succession from the loop vectorizer planner.
666class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
667public:
668 InnerLoopAndEpilogueVectorizer(
669 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
670 DominatorTree *DT, const TargetLibraryInfo *TLI,
671 const TargetTransformInfo *TTI, AssumptionCache *AC,
672 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
673 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
674 ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
675 : InnerLoopVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
676 EPI.MainLoopVF, EPI.MainLoopVF, EPI.MainLoopUF, CM,
677 BFI, PSI, Checks, Plan),
678 EPI(EPI) {}
679
680 // Override this function to handle the more complex control flow around the
681 // three loops.
682 BasicBlock *createVectorizedLoopSkeleton() final {
683 return createEpilogueVectorizedLoopSkeleton();
684 }
685
686 /// The interface for creating a vectorized skeleton using one of two
687 /// different strategies, each corresponding to one execution of the vplan
688 /// as described above.
689 virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
690
691 /// Holds and updates state information required to vectorize the main loop
692 /// and its epilogue in two separate passes. This setup helps us avoid
693 /// regenerating and recomputing runtime safety checks. It also helps us to
694 /// shorten the iteration-count-check path length for the cases where the
695 /// iteration count of the loop is so small that the main vector loop is
696 /// completely skipped.
697 EpilogueLoopVectorizationInfo &EPI;
698};
699
700/// A specialized derived class of inner loop vectorizer that performs
701/// vectorization of *main* loops in the process of vectorizing loops and their
702/// epilogues.
703class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
704public:
705 EpilogueVectorizerMainLoop(
706 Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
707 DominatorTree *DT, const TargetLibraryInfo *TLI,
708 const TargetTransformInfo *TTI, AssumptionCache *AC,
709 OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
710 LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
711 ProfileSummaryInfo *PSI, GeneratedRTChecks &Check, VPlan &Plan)
712 : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
713 EPI, CM, BFI, PSI, Check, Plan) {}
714 /// Implements the interface for creating a vectorized skeleton using the
715 /// *main loop* strategy (ie the first pass of vplan execution).
716 BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
717
718protected:
719 /// Emits an iteration count bypass check once for the main loop (when \p
720 /// ForEpilogue is false) and once for the epilogue loop (when \p
721 /// ForEpilogue is true).
722 BasicBlock *emitIterationCountCheck(BasicBlock *Bypass, bool ForEpilogue);
723 void printDebugTracesAtStart() override;
724 void printDebugTracesAtEnd() override;
725};
726
/// A specialized derived class of inner loop vectorizer that performs
/// vectorization of *epilogue* loops in the process of vectorizing loops and
/// their epilogues.
class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
public:
  EpilogueVectorizerEpilogueLoop(
      Loop *OrigLoop, PredicatedScalarEvolution &PSE, LoopInfo *LI,
      DominatorTree *DT, const TargetLibraryInfo *TLI,
      const TargetTransformInfo *TTI, AssumptionCache *AC,
      OptimizationRemarkEmitter *ORE, EpilogueLoopVectorizationInfo &EPI,
      LoopVectorizationCostModel *CM, BlockFrequencyInfo *BFI,
      ProfileSummaryInfo *PSI, GeneratedRTChecks &Checks, VPlan &Plan)
      : InnerLoopAndEpilogueVectorizer(OrigLoop, PSE, LI, DT, TLI, TTI, AC, ORE,
                                       EPI, CM, BFI, PSI, Checks, Plan) {
    // Reuse the trip count computed during the first (main-loop) pass instead
    // of recomputing it for the epilogue pass.
    TripCount = EPI.TripCount;
  }
  /// Implements the interface for creating a vectorized skeleton using the
  /// *epilogue loop* strategy (ie the second pass of vplan execution).
  BasicBlock *createEpilogueVectorizedLoopSkeleton() final;

protected:
  /// Emits an iteration count bypass check after the main vector loop has
  /// finished to see if there are any iterations left to execute by either
  /// the vector epilogue or the scalar epilogue.
  BasicBlock *emitMinimumVectorEpilogueIterCountCheck(
      BasicBlock *Bypass,
      BasicBlock *Insert);
  void printDebugTracesAtStart() override;
  void printDebugTracesAtEnd() override;
};
757} // end namespace llvm
758
759/// Look for a meaningful debug location on the instruction or its operands.
760static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
761 if (!I)
762 return DebugLoc::getUnknown();
763
764 DebugLoc Empty;
765 if (I->getDebugLoc() != Empty)
766 return I->getDebugLoc();
767
768 for (Use &Op : I->operands()) {
769 if (Instruction *OpInst = dyn_cast<Instruction>(Val&: Op))
770 if (OpInst->getDebugLoc() != Empty)
771 return OpInst->getDebugLoc();
772 }
773
774 return I->getDebugLoc();
775}
776
/// Write a \p DebugMsg about vectorization to the debug output stream. If \p I
/// is passed, the message relates to that particular instruction.
#ifndef NDEBUG
static void debugVectorizationMessage(const StringRef Prefix,
                                      const StringRef DebugMsg,
                                      Instruction *I) {
  dbgs() << "LV: " << Prefix << DebugMsg;
  // Either append the related instruction, or terminate the sentence with a
  // period when there is none.
  if (I != nullptr)
    dbgs() << " " << *I;
  else
    dbgs() << '.';
  dbgs() << '\n';
}
#endif
791
/// Create an analysis remark that explains why vectorization failed
///
/// \p PassName is the name of the pass (e.g. can be AlwaysPrint). \p
/// RemarkName is the identifier for the remark. If \p I is passed it is an
/// instruction that prevents vectorization. Otherwise \p TheLoop is used for
/// the location of the remark. If \p DL is passed, use it as debug location for
/// the remark. \return the remark object that can be streamed to.
static OptimizationRemarkAnalysis
createLVAnalysis(const char *PassName, StringRef RemarkName, Loop *TheLoop,
                 Instruction *I, DebugLoc DL = {}) {
  BasicBlock *CodeRegion = I ? I->getParent() : TheLoop->getHeader();
  // If debug location is attached to the instruction, use it. Otherwise if DL
  // was not provided, use the loop's.
  // Precedence: instruction's own location > caller-supplied DL > loop start.
  if (I && I->getDebugLoc())
    DL = I->getDebugLoc();
  else if (!DL)
    DL = TheLoop->getStartLoc();

  return OptimizationRemarkAnalysis(PassName, RemarkName, DL, CodeRegion);
}
812
813namespace llvm {
814
815/// Return a value for Step multiplied by VF.
816Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
817 int64_t Step) {
818 assert(Ty->isIntegerTy() && "Expected an integer step");
819 return B.CreateElementCount(Ty, EC: VF.multiplyCoefficientBy(RHS: Step));
820}
821
/// Return the runtime value for VF.
Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF) {
  // Delegates to IRBuilder to materialize the element count as a value of
  // type \p Ty (constant for fixed VFs, vscale-based for scalable VFs).
  return B.CreateElementCount(Ty, EC: VF);
}
826
/// Reports a vectorization failure: prints \p DebugMsg to the debug stream
/// and emits an analysis remark (identified by \p ORETag) carrying \p OREMsg
/// via \p ORE. The remark is located at \p I if given, otherwise at
/// \p TheLoop.
void reportVectorizationFailure(const StringRef DebugMsg,
                                const StringRef OREMsg, const StringRef ORETag,
                                OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                Instruction *I) {
  LLVM_DEBUG(debugVectorizationMessage("Not vectorizing: ", DebugMsg, I));
  // Hints are only used to pick the remark pass name, so the second argument
  // is irrelevant here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(
      OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop, I)
      << "loop not vectorized: " << OREMsg);
}
837
/// Reports an informative message: print \p Msg for debugging purposes as well
/// as an optimization remark. Uses either \p I as location of the remark, or
/// otherwise \p TheLoop. If \p DL is passed, use it as debug location for the
/// remark.
static void reportVectorizationInfo(const StringRef Msg, const StringRef ORETag,
                                    OptimizationRemarkEmitter *ORE,
                                    Loop *TheLoop, Instruction *I = nullptr,
                                    DebugLoc DL = {}) {
  LLVM_DEBUG(debugVectorizationMessage("", Msg, I));
  // Hints are only used to pick the remark pass name here.
  LoopVectorizeHints Hints(TheLoop, true /* doesn't matter */, *ORE);
  ORE->emit(OptDiag: createLVAnalysis(PassName: Hints.vectorizeAnalysisPassName(), RemarkName: ORETag, TheLoop,
                           I, DL)
            << Msg);
}
852
/// Report successful vectorization of the loop. In case an outer loop is
/// vectorized, prepend "outer" to the vectorization remark.
static void reportVectorization(OptimizationRemarkEmitter *ORE, Loop *TheLoop,
                                VectorizationFactor VF, unsigned IC) {
  LLVM_DEBUG(debugVectorizationMessage(
      "Vectorizing: ", TheLoop->isInnermost() ? "innermost loop" : "outer loop",
      nullptr));
  StringRef LoopType = TheLoop->isInnermost() ? "" : "outer ";
  // Emit lazily via a callback so the remark is only built when enabled.
  ORE->emit(RemarkBuilder: [&]() {
    return OptimizationRemark(LV_NAME, "Vectorized", TheLoop->getStartLoc(),
                              TheLoop->getHeader())
           << "vectorized " << LoopType << "loop (vectorization width: "
           << ore::NV("VectorizationFactor", VF.Width)
           << ", interleaved count: " << ore::NV("InterleaveCount", IC) << ")";
  });
}
869
870} // end namespace llvm
871
872namespace llvm {
873
// Loop vectorization cost-model hints how the scalar epilogue loop should be
// lowered.
enum ScalarEpilogueLowering {

  // The default: allowing scalar epilogues.
  CM_ScalarEpilogueAllowed,

  // Vectorization with OptForSize: don't allow epilogues.
  CM_ScalarEpilogueNotAllowedOptSize,

  // A special case of vectorization with OptForSize: loops with a very small
  // trip count are considered for vectorization under OptForSize, thereby
  // making sure the cost of their loop body is dominant, free of runtime
  // guards and scalar iteration overheads.
  CM_ScalarEpilogueNotAllowedLowTripLoop,

  // Loop hint predicate indicating an epilogue is undesired.
  CM_ScalarEpilogueNotNeededUsePredicate,

  // Directive indicating we must either tail fold or not vectorize.
  CM_ScalarEpilogueNotAllowedUsePredicate
};
896
897/// LoopVectorizationCostModel - estimates the expected speedups due to
898/// vectorization.
899/// In many cases vectorization is not profitable. This can happen because of
900/// a number of reasons. In this class we mainly attempt to predict the
901/// expected speedup/slowdowns due to the supported instruction set. We use the
902/// TargetTransformInfo to query the different backends for the cost of
903/// different operations.
904class LoopVectorizationCostModel {
905 friend class LoopVectorizationPlanner;
906
907public:
  /// Construct the cost model. Captures all analyses needed for costing and
  /// eagerly computes vscale tuning info, the cost kind, and the
  /// optimize-for-size decision for the original loop.
  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
                             LoopVectorizationLegality *Legal,
                             const TargetTransformInfo &TTI,
                             const TargetLibraryInfo *TLI, DemandedBits *DB,
                             AssumptionCache *AC,
                             OptimizationRemarkEmitter *ORE, const Function *F,
                             const LoopVectorizeHints *Hints,
                             InterleavedAccessInfo &IAI,
                             ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI)
      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
        Hints(Hints), InterleaveInfo(IAI) {
    // vscale tuning is only meaningful if scalable vectors are supported by
    // the target (or forced on via the command-line flag).
    if (TTI.supportsScalableVectors() || ForceTargetSupportsScalableVectors)
      initializeVScaleForTuning();
    // minsize functions are costed by code size instead of throughput.
    CostKind = F->hasMinSize() ? TTI::TCK_CodeSize : TTI::TCK_RecipThroughput;
    // Query this against the original loop and save it here because the profile
    // of the original loop header may change as the transformation happens.
    OptForSize = llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
                                             QueryType: PGSOQueryType::IRPass);
  }
929
930 /// \return An upper bound for the vectorization factors (both fixed and
931 /// scalable). If the factors are 0, vectorization and interleaving should be
932 /// avoided up front.
933 FixedScalableVFPair computeMaxVF(ElementCount UserVF, unsigned UserIC);
934
935 /// \return True if runtime checks are required for vectorization, and false
936 /// otherwise.
937 bool runtimeChecksRequired();
938
  /// Setup cost-based decisions for user vectorization factor.
  /// \return true if the UserVF is a feasible VF to be chosen.
  bool selectUserVectorizationFactor(ElementCount UserVF) {
    collectNonVectorizedAndSetWideningDecisions(VF: UserVF);
    // Feasible iff the expected cost at UserVF is computable (valid).
    return expectedCost(VF: UserVF).isValid();
  }
945
946 /// \return True if maximizing vector bandwidth is enabled by the target or
947 /// user options, for the given register kind.
948 bool useMaxBandwidth(TargetTransformInfo::RegisterKind RegKind);
949
950 /// \return True if maximizing vector bandwidth is enabled by the target or
951 /// user options, for the given vector factor.
952 bool useMaxBandwidth(ElementCount VF);
953
954 /// \return The size (in bits) of the smallest and widest types in the code
955 /// that needs to be vectorized. We ignore values that remain scalar such as
956 /// 64 bit loop indices.
957 std::pair<unsigned, unsigned> getSmallestAndWidestTypes();
958
959 /// \return The desired interleave count.
960 /// If interleave count has been specified by metadata it will be returned.
961 /// Otherwise, the interleave count is computed and returned. VF and LoopCost
962 /// are the selected vectorization factor and the cost of the selected VF.
963 unsigned selectInterleaveCount(VPlan &Plan, ElementCount VF,
964 InstructionCost LoopCost);
965
966 /// Memory access instruction may be vectorized in more than one way.
967 /// Form of instruction after vectorization depends on cost.
968 /// This function takes cost-based decisions for Load/Store instructions
969 /// and collects them in a map. This decisions map is used for building
970 /// the lists of loop-uniform and loop-scalar instructions.
971 /// The calculated cost is saved with widening decision in order to
972 /// avoid redundant calculations.
973 void setCostBasedWideningDecision(ElementCount VF);
974
975 /// A call may be vectorized in different ways depending on whether we have
976 /// vectorized variants available and whether the target supports masking.
977 /// This function analyzes all calls in the function at the supplied VF,
978 /// makes a decision based on the costs of available options, and stores that
979 /// decision in a map for use in planning and plan execution.
980 void setVectorizedCallDecision(ElementCount VF);
981
982 /// Collect values we want to ignore in the cost model.
983 void collectValuesToIgnore();
984
985 /// Collect all element types in the loop for which widening is needed.
986 void collectElementTypesForWidening();
987
988 /// Split reductions into those that happen in the loop, and those that happen
989 /// outside. In loop reductions are collected into InLoopReductions.
990 void collectInLoopReductions();
991
  /// Returns true if we should use strict in-order reductions for the given
  /// RdxDesc. This is true if the -enable-strict-reductions flag is passed,
  /// the IsOrdered flag of RdxDesc is set and we do not allow reordering
  /// of FP operations.
  bool useOrderedReductions(const RecurrenceDescriptor &RdxDesc) const {
    // Loop hints allowing reordering override the ordered requirement.
    return !Hints->allowReordering() && RdxDesc.isOrdered();
  }
999
  /// \returns The smallest bitwidth each instruction can be represented with.
  /// The vector equivalents of these instructions should be truncated to this
  /// type.
  const MapVector<Instruction *, uint64_t> &getMinimalBitwidths() const {
    return MinBWs;
  }
1006
  /// \returns True if it is more profitable to scalarize instruction \p I for
  /// vectorization factor \p VF.
  bool isProfitableToScalarize(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() &&
           "Profitable to scalarize relevant only for VF > 1.");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // InstsToScalarize is populated per-VF by collectInstsToScalarize; it must
    // have run for this VF before querying.
    auto Scalars = InstsToScalarize.find(Val: VF);
    assert(Scalars != InstsToScalarize.end() &&
           "VF not yet analyzed for scalarization profitability");
    return Scalars->second.contains(Val: I);
  }
1021
  /// Returns true if \p I is known to be uniform after vectorization.
  bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // Pseudo probe needs to be duplicated for each unrolled iteration and
    // vector lane so that profiled loop trip count can be accurately
    // accumulated instead of being under counted.
    if (isa<PseudoProbeInst>(Val: I))
      return false;

    // At VF=1 every instruction is trivially uniform.
    if (VF.isScalar())
      return true;

    auto UniformsPerVF = Uniforms.find(Val: VF);
    assert(UniformsPerVF != Uniforms.end() &&
           "VF not yet analyzed for uniformity");
    return UniformsPerVF->second.count(Ptr: I);
  }
1041
  /// Returns true if \p I is known to be scalar after vectorization.
  bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const {
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");
    // At VF=1 every instruction trivially remains scalar.
    if (VF.isScalar())
      return true;

    auto ScalarsPerVF = Scalars.find(Val: VF);
    assert(ScalarsPerVF != Scalars.end() &&
           "Scalar values are not calculated for VF");
    return ScalarsPerVF->second.count(Ptr: I);
  }
1055
  /// \returns True if instruction \p I can be truncated to a smaller bitwidth
  /// for vectorization factor \p VF.
  bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const {
    // Truncation only applies to instructions that will actually be widened:
    // scalarized or scalar-after-vectorization instructions are excluded.
    return VF.isVector() && MinBWs.contains(Key: I) &&
           !isProfitableToScalarize(I, VF) &&
           !isScalarAfterVectorization(I, VF);
  }
1063
  /// Decision that was taken during cost calculation for memory instruction.
  enum InstWidening {
    CM_Unknown,         // No decision recorded for this instruction/VF pair.
    CM_Widen,           // For consecutive accesses with stride +1.
    CM_Widen_Reverse,   // For consecutive accesses with stride -1.
    CM_Interleave,      // Member of an interleaved access group.
    CM_GatherScatter,   // Lowered as a masked gather/scatter.
    CM_Scalarize,       // Replicated as scalar operations per lane.
    CM_VectorCall,      // Call lowered to a vector library variant.
    CM_IntrinsicCall    // Call lowered to a vector intrinsic.
  };
1075
  /// Save vectorization decision \p W and \p Cost taken by the cost model for
  /// instruction \p I and vector width \p VF.
  void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W,
                           InstructionCost Cost) {
    assert(VF.isVector() && "Expected VF >=2");
    // Keyed by (instruction, VF); overwrites any previous decision.
    WideningDecisions[{I, VF}] = {W, Cost};
  }
1083
1084 /// Save vectorization decision \p W and \p Cost taken by the cost model for
1085 /// interleaving group \p Grp and vector width \p VF.
1086 void setWideningDecision(const InterleaveGroup<Instruction> *Grp,
1087 ElementCount VF, InstWidening W,
1088 InstructionCost Cost) {
1089 assert(VF.isVector() && "Expected VF >=2");
1090 /// Broadcast this decicion to all instructions inside the group.
1091 /// When interleaving, the cost will only be assigned one instruction, the
1092 /// insert position. For other cases, add the appropriate fraction of the
1093 /// total cost to each instruction. This ensures accurate costs are used,
1094 /// even if the insert position instruction is not used.
1095 InstructionCost InsertPosCost = Cost;
1096 InstructionCost OtherMemberCost = 0;
1097 if (W != CM_Interleave)
1098 OtherMemberCost = InsertPosCost = Cost / Grp->getNumMembers();
1099 ;
1100 for (unsigned Idx = 0; Idx < Grp->getFactor(); ++Idx) {
1101 if (auto *I = Grp->getMember(Index: Idx)) {
1102 if (Grp->getInsertPos() == I)
1103 WideningDecisions[{I, VF}] = {W, InsertPosCost};
1104 else
1105 WideningDecisions[{I, VF}] = {W, OtherMemberCost};
1106 }
1107 }
1108 }
1109
  /// Return the cost model decision for the given instruction \p I and vector
  /// width \p VF. Return CM_Unknown if this instruction did not pass
  /// through the cost modeling.
  InstWidening getWideningDecision(Instruction *I, ElementCount VF) const {
    assert(VF.isVector() && "Expected VF to be a vector VF");
    assert(
        TheLoop->isInnermost() &&
        "cost-model should not be used for outer loops (in VPlan-native path)");

    // Missing entries are reported as CM_Unknown rather than asserting, since
    // some instructions legitimately never get a widening decision.
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    auto Itr = WideningDecisions.find(Val: InstOnVF);
    if (Itr == WideningDecisions.end())
      return CM_Unknown;
    return Itr->second.first;
  }
1125
  /// Return the vectorization cost for the given instruction \p I and vector
  /// width \p VF.
  InstructionCost getWideningCost(Instruction *I, ElementCount VF) {
    assert(VF.isVector() && "Expected VF >=2");
    // Unlike getWideningDecision, querying a cost that was never computed is a
    // programming error.
    std::pair<Instruction *, ElementCount> InstOnVF(I, VF);
    assert(WideningDecisions.contains(InstOnVF) &&
           "The cost is not calculated");
    return WideningDecisions[InstOnVF].second;
  }
1135
  /// Cost-model decision for a call instruction at a given VF: how to widen
  /// it, which vector variant or intrinsic to use, where the mask goes (if
  /// any), and the resulting cost.
  struct CallWideningDecision {
    InstWidening Kind;               // How the call is widened.
    Function *Variant;               // Vector library variant, if used.
    Intrinsic::ID IID;               // Vector intrinsic ID, if used.
    std::optional<unsigned> MaskPos; // Operand index of the mask, if masked.
    InstructionCost Cost;            // Cost of the chosen lowering.
  };
1143
  /// Record the widening decision for call \p CI at width \p VF, together with
  /// the chosen variant/intrinsic, optional mask position and cost.
  void setCallWideningDecision(CallInst *CI, ElementCount VF, InstWidening Kind,
                               Function *Variant, Intrinsic::ID IID,
                               std::optional<unsigned> MaskPos,
                               InstructionCost Cost) {
    assert(!VF.isScalar() && "Expected vector VF");
    CallWideningDecisions[{CI, VF}] = {.Kind: Kind, .Variant: Variant, .IID: IID, .MaskPos: MaskPos, .Cost: Cost};
  }
1151
  /// Return the recorded widening decision for call \p CI at width \p VF.
  /// The decision must have been set beforehand (at() asserts presence).
  CallWideningDecision getCallWideningDecision(CallInst *CI,
                                               ElementCount VF) const {
    assert(!VF.isScalar() && "Expected vector VF");
    return CallWideningDecisions.at(Val: {CI, VF});
  }
1157
  /// Return True if instruction \p I is an optimizable truncate whose operand
  /// is an induction variable. Such a truncate will be removed by adding a new
  /// induction variable with the destination type.
  bool isOptimizableIVTruncate(Instruction *I, ElementCount VF) {
    // If the instruction is not a truncate, return false.
    auto *Trunc = dyn_cast<TruncInst>(Val: I);
    if (!Trunc)
      return false;

    // Get the source and destination types of the truncate.
    Type *SrcTy = toVectorTy(Scalar: Trunc->getSrcTy(), EC: VF);
    Type *DestTy = toVectorTy(Scalar: Trunc->getDestTy(), EC: VF);

    // If the truncate is free for the given types, return false. Replacing a
    // free truncate with an induction variable would add an induction variable
    // update instruction to each iteration of the loop. We exclude from this
    // check the primary induction variable since it will need an update
    // instruction regardless.
    Value *Op = Trunc->getOperand(i_nocapture: 0);
    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(Ty1: SrcTy, Ty2: DestTy))
      return false;

    // If the truncated value is not an induction variable, return false.
    return Legal->isInductionPhi(V: Op);
  }
1183
1184 /// Collects the instructions to scalarize for each predicated instruction in
1185 /// the loop.
1186 void collectInstsToScalarize(ElementCount VF);
1187
  /// Collect values that will not be widened, including Uniforms, Scalars, and
  /// Instructions to Scalarize for the given \p VF.
  /// The sets depend on CM decision for Load/Store instructions
  /// that may be vectorized as interleave, gather-scatter or scalarized.
  /// Also make a decision on what to do about call instructions in the loop
  /// at that VF -- scalarize, call a known vector routine, or call a
  /// vector intrinsic.
  void collectNonVectorizedAndSetWideningDecisions(ElementCount VF) {
    // Do the analysis once.
    // Uniforms serves as the "already analyzed for this VF" marker.
    if (VF.isScalar() || Uniforms.contains(Val: VF))
      return;
    // Order matters: widening decisions feed uniformity/scalarity analysis,
    // which in turn feeds the scalarization-profitability analysis.
    setCostBasedWideningDecision(VF);
    collectLoopUniforms(VF);
    setVectorizedCallDecision(VF);
    collectLoopScalars(VF);
    collectInstsToScalarize(VF);
  }
1205
  /// Returns true if the target machine supports masked store operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment,
                          unsigned AddressSpace) const {
    // Masked stores only apply to consecutive accesses; gathers/scatters are
    // checked separately.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           TTI.isLegalMaskedStore(DataType, Alignment, AddressSpace);
  }
1213
  /// Returns true if the target machine supports masked load operation
  /// for the given \p DataType and kind of access to \p Ptr.
  bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment,
                         unsigned AddressSpace) const {
    // Masked loads only apply to consecutive accesses; gathers/scatters are
    // checked separately.
    return Legal->isConsecutivePtr(AccessTy: DataType, Ptr) &&
           TTI.isLegalMaskedLoad(DataType, Alignment, AddressSpace);
  }
1221
1222 /// Returns true if the target machine can represent \p V as a masked gather
1223 /// or scatter operation.
1224 bool isLegalGatherOrScatter(Value *V, ElementCount VF) {
1225 bool LI = isa<LoadInst>(Val: V);
1226 bool SI = isa<StoreInst>(Val: V);
1227 if (!LI && !SI)
1228 return false;
1229 auto *Ty = getLoadStoreType(I: V);
1230 Align Align = getLoadStoreAlignment(I: V);
1231 if (VF.isVector())
1232 Ty = VectorType::get(ElementType: Ty, EC: VF);
1233 return (LI && TTI.isLegalMaskedGather(DataType: Ty, Alignment: Align)) ||
1234 (SI && TTI.isLegalMaskedScatter(DataType: Ty, Alignment: Align));
1235 }
1236
  /// Returns true if the target machine supports all of the reduction
  /// variables found for the given VF.
  bool canVectorizeReductions(ElementCount VF) const {
    // Every reduction discovered by legality analysis must be individually
    // legal at this VF.
    return (all_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
      const RecurrenceDescriptor &RdxDesc = Reduction.second;
      return TTI.isLegalToVectorizeReduction(RdxDesc, VF);
    }));
  }
1245
  /// Given costs for both strategies, return true if the scalar predication
  /// lowering should be used for div/rem. This incorporates an override
  /// option so it is not simply a cost comparison.
  bool isDivRemScalarWithPredication(InstructionCost ScalarCost,
                                     InstructionCost SafeDivisorCost) const {
    switch (ForceSafeDivisor) {
    case cl::BOU_UNSET:
      // No override: pick the cheaper strategy.
      return ScalarCost < SafeDivisorCost;
    case cl::BOU_TRUE:
      // Safe-divisor forced on: never scalarize.
      return false;
    case cl::BOU_FALSE:
      // Safe-divisor forced off: always scalarize.
      return true;
    }
    llvm_unreachable("impossible case value");
  }
1261
1262 /// Returns true if \p I is an instruction which requires predication and
1263 /// for which our chosen predication strategy is scalarization (i.e. we
1264 /// don't have an alternate strategy such as masking available).
1265 /// \p VF is the vectorization factor that will be used to vectorize \p I.
1266 bool isScalarWithPredication(Instruction *I, ElementCount VF) const;
1267
1268 /// Returns true if \p I is an instruction that needs to be predicated
1269 /// at runtime. The result is independent of the predication mechanism.
1270 /// Superset of instructions that return true for isScalarWithPredication.
1271 bool isPredicatedInst(Instruction *I) const;
1272
1273 /// Return the costs for our two available strategies for lowering a
1274 /// div/rem operation which requires speculating at least one lane.
1275 /// First result is for scalarization (will be invalid for scalable
1276 /// vectors); second is for the safe-divisor strategy.
1277 std::pair<InstructionCost, InstructionCost>
1278 getDivRemSpeculationCost(Instruction *I,
1279 ElementCount VF) const;
1280
1281 /// Returns true if \p I is a memory instruction with consecutive memory
1282 /// access that can be widened.
1283 bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF);
1284
1285 /// Returns true if \p I is a memory instruction in an interleaved-group
1286 /// of memory accesses that can be vectorized with wide vector loads/stores
1287 /// and shuffles.
1288 bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF) const;
1289
1290 /// Check if \p Instr belongs to any interleaved access group.
1291 bool isAccessInterleaved(Instruction *Instr) const {
1292 return InterleaveInfo.isInterleaved(Instr);
1293 }
1294
1295 /// Get the interleaved access group that \p Instr belongs to.
1296 const InterleaveGroup<Instruction> *
1297 getInterleavedAccessGroup(Instruction *Instr) const {
1298 return InterleaveInfo.getInterleaveGroup(Instr);
1299 }
1300
  /// Returns true if we're required to use a scalar epilogue for at least
  /// the final iteration of the original loop.
  bool requiresScalarEpilogue(bool IsVectorizing) const {
    // If scalar epilogues are disallowed (optsize / hints), none is required.
    if (!isScalarEpilogueAllowed()) {
      LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
      return false;
    }
    // If we might exit from anywhere but the latch and early exit vectorization
    // is disabled, we must run the exiting iteration in scalar form.
    if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
        !(EnableEarlyExitVectorization && Legal->hasUncountableEarlyExit())) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: not exiting "
                           "from latch block\n");
      return true;
    }
    // Interleave groups with gaps may read past the last iteration; they
    // require the final iteration(s) to run in scalar form.
    if (IsVectorizing && InterleaveInfo.requiresScalarEpilogue()) {
      LLVM_DEBUG(dbgs() << "LV: Loop requires scalar epilogue: "
                           "interleaved group requires scalar epilogue\n");
      return true;
    }
    LLVM_DEBUG(dbgs() << "LV: Loop does not require scalar epilogue\n");
    return false;
  }
1324
  /// Returns true if a scalar epilogue is not allowed due to optsize or a
  /// loop hint annotation.
  bool isScalarEpilogueAllowed() const {
    // Any of the CM_ScalarEpilogueNot* states disallows the epilogue.
    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
  }
1330
  /// Returns the TailFoldingStyle that is best for the current loop.
  /// Returns None until setTailFoldingStyles() has been called.
  TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
    if (!ChosenTailFoldingStyle)
      return TailFoldingStyle::None;
    // Two styles are cached: one for when the IV update may overflow, one
    // for when it provably cannot.
    return IVUpdateMayOverflow ? ChosenTailFoldingStyle->first
                               : ChosenTailFoldingStyle->second;
  }
1338
  /// Selects and saves TailFoldingStyle for 2 options - if IV update may
  /// overflow or not.
  /// \param IsScalableVF true if scalable vector factors enabled.
  /// \param UserIC User specific interleave count.
  void setTailFoldingStyles(bool IsScalableVF, unsigned UserIC) {
    assert(!ChosenTailFoldingStyle && "Tail folding must not be selected yet.");
    // If legality rules out tail folding entirely, record None for both cases.
    if (!Legal->canFoldTailByMasking()) {
      ChosenTailFoldingStyle = {TailFoldingStyle::None, TailFoldingStyle::None};
      return;
    }

    // Default to TTI preference, but allow command line override.
    ChosenTailFoldingStyle = {
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/true),
        TTI.getPreferredTailFoldingStyle(/*IVUpdateMayOverflow=*/false)};
    if (ForceTailFoldingStyle.getNumOccurrences())
      ChosenTailFoldingStyle = {ForceTailFoldingStyle.getValue(),
                                ForceTailFoldingStyle.getValue()};

    // Only a forced DataWithEVL style needs the extra legality checks below.
    if (ForceTailFoldingStyle != TailFoldingStyle::DataWithEVL)
      return;
    // Override forced styles if needed.
    // FIXME: Investigate opportunity for fixed vector factor.
    bool EVLIsLegal = UserIC <= 1 && IsScalableVF &&
                      TTI.hasActiveVectorLength() && !EnableVPlanNativePath;
    if (EVLIsLegal)
      return;
    // If for some reason EVL mode is unsupported, fallback to
    // DataWithoutLaneMask to try to vectorize the loop with folded tail
    // in a generic way.
    ChosenTailFoldingStyle = {TailFoldingStyle::DataWithoutLaneMask,
                              TailFoldingStyle::DataWithoutLaneMask};
    LLVM_DEBUG(
        dbgs() << "LV: Preference for VP intrinsics indicated. Will "
                  "not try to generate VP Intrinsics "
               << (UserIC > 1
                       ? "since interleave count specified is greater than 1.\n"
                       : "due to non-interleaving reasons.\n"));
  }
1378
  /// Returns true if all loop blocks should be masked to fold tail loop.
  bool foldTailByMasking() const {
    // TODO: check if it is possible to check for None style independent of
    // IVUpdateMayOverflow flag in getTailFoldingStyle.
    return getTailFoldingStyle() != TailFoldingStyle::None;
  }
1385
  /// Return maximum safe number of elements to be processed per vector
  /// iteration, which do not prevent store-load forwarding and are safe with
  /// regard to the memory dependencies. Required for EVL-based VPlans to
  /// correctly calculate AVL (application vector length) as min(remaining AVL,
  /// MaxSafeElements).
  /// Returns std::nullopt when no such limit applies.
  /// TODO: need to consider adjusting cost model to use this value as a
  /// vectorization factor for EVL-based vectorization.
  std::optional<unsigned> getMaxSafeElements() const { return MaxSafeElements; }
1394
  /// Returns true if the instructions in this block requires predication
  /// for any reason, e.g. because tail folding now requires a predicate
  /// or because the block in the original loop was predicated.
  bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const {
    // Tail folding masks every block; otherwise defer to legality analysis.
    return foldTailByMasking() || Legal->blockNeedsPredication(BB);
  }
1401
  /// Returns true if VP intrinsics with explicit vector length support should
  /// be generated in the tail folded loop.
  bool foldTailWithEVL() const {
    return getTailFoldingStyle() == TailFoldingStyle::DataWithEVL;
  }
1407
  /// Returns true if the Phi is part of an inloop reduction.
  /// InLoopReductions is populated by collectInLoopReductions().
  bool isInLoopReduction(PHINode *Phi) const {
    return InLoopReductions.contains(Ptr: Phi);
  }
1412
  /// Returns true if the predicated reduction select should be used to set the
  /// incoming value for the reduction phi.
  bool usePredicatedReductionSelect() const {
    // Force to use predicated reduction select since the EVL of the
    // second-to-last iteration might not be VF*UF.
    if (foldTailWithEVL())
      return true;
    // Otherwise honor the command-line flag or the target preference.
    return PreferPredicatedReductionSelect ||
           TTI.preferPredicatedReductionSelect();
  }
1423
1424 /// Estimate cost of an intrinsic call instruction CI if it were vectorized
1425 /// with factor VF. Return the cost of the instruction, including
1426 /// scalarization overhead if it's needed.
1427 InstructionCost getVectorIntrinsicCost(CallInst *CI, ElementCount VF) const;
1428
1429 /// Estimate cost of a call instruction CI if it were vectorized with factor
1430 /// VF. Return the cost of the instruction, including scalarization overhead
1431 /// if it's needed.
1432 InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF) const;
1433
  /// Invalidates decisions already taken by the cost model.
  /// Clears all per-VF caches so later queries trigger a fresh analysis.
  void invalidateCostModelingDecisions() {
    WideningDecisions.clear();
    CallWideningDecisions.clear();
    Uniforms.clear();
    Scalars.clear();
  }
1441
1442 /// Returns the expected execution cost. The unit of the cost does
1443 /// not matter because we use the 'cost' units to compare different
1444 /// vector widths. The cost that is returned is *not* normalized by
1445 /// the factor width.
1446 InstructionCost expectedCost(ElementCount VF);
1447
1448 bool hasPredStores() const { return NumPredStores > 0; }
1449
1450 /// Returns true if epilogue vectorization is considered profitable, and
1451 /// false otherwise.
1452 /// \p VF is the vectorization factor chosen for the original loop.
  /// \p IC is the interleave count chosen for the original loop; it acts as
  /// an additional scaling factor applied to VF before comparing to
  /// EpilogueVectorizationMinVF.
1455 bool isEpilogueVectorizationProfitable(const ElementCount VF,
1456 const unsigned IC) const;
1457
1458 /// Returns the execution time cost of an instruction for a given vector
1459 /// width. Vector width of one means scalar.
1460 InstructionCost getInstructionCost(Instruction *I, ElementCount VF);
1461
1462 /// Return the cost of instructions in an inloop reduction pattern, if I is
1463 /// part of that pattern.
1464 std::optional<InstructionCost> getReductionPatternCost(Instruction *I,
1465 ElementCount VF,
1466 Type *VectorTy) const;
1467
1468 /// Returns true if \p Op should be considered invariant and if it is
1469 /// trivially hoistable.
1470 bool shouldConsiderInvariant(Value *Op);
1471
1472 /// Return the value of vscale used for tuning the cost model.
1473 std::optional<unsigned> getVScaleForTuning() const { return VScaleForTuning; }
1474
private:
  /// Number of stores expected to execute under predication; queried via
  /// hasPredStores(). NOTE(review): updated elsewhere in the cost model —
  /// not visible in this section.
  unsigned NumPredStores = 0;

  /// Used to store the value of vscale used for tuning the cost model. It is
  /// initialized during object construction.
  std::optional<unsigned> VScaleForTuning;
1481
1482 /// Initializes the value of vscale used for tuning the cost model. If
1483 /// vscale_range.min == vscale_range.max then return vscale_range.max, else
1484 /// return the value returned by the corresponding TTI method.
1485 void initializeVScaleForTuning() {
1486 const Function *Fn = TheLoop->getHeader()->getParent();
1487 if (Fn->hasFnAttribute(Kind: Attribute::VScaleRange)) {
1488 auto Attr = Fn->getFnAttribute(Kind: Attribute::VScaleRange);
1489 auto Min = Attr.getVScaleRangeMin();
1490 auto Max = Attr.getVScaleRangeMax();
1491 if (Max && Min == Max) {
1492 VScaleForTuning = Max;
1493 return;
1494 }
1495 }
1496
1497 VScaleForTuning = TTI.getVScaleForTuning();
1498 }
1499
  /// \return An upper bound for the vectorization factors for both
  /// fixed and scalable vectorization, where the minimum-known number of
  /// elements is a power-of-2 larger than zero. If scalable vectorization is
  /// disabled or unsupported, then the scalable part will be equal to
  /// ElementCount::getScalable(0).
  FixedScalableVFPair computeFeasibleMaxVF(unsigned MaxTripCount,
                                           ElementCount UserVF,
                                           bool FoldTailByMasking);

  /// \return the maximized element count based on the target's vector
  /// registers and the loop trip-count, but limited to a maximum safe VF.
  /// This is a helper function of computeFeasibleMaxVF.
  ElementCount getMaximizedVFForTarget(unsigned MaxTripCount,
                                       unsigned SmallestType,
                                       unsigned WidestType,
                                       ElementCount MaxSafeVF,
                                       bool FoldTailByMasking);

  /// Checks if scalable vectorization is supported and enabled. Caches the
  /// result to avoid repeated debug dumps for repeated queries.
  bool isScalableVectorizationAllowed();

  /// \return the maximum legal scalable VF, based on the safe max number
  /// of elements.
  ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements);

  /// Calculate vectorization cost of memory instruction \p I.
  InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF);

  /// The cost computation for scalarized memory instruction.
  InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF);

  /// The cost computation for interleaving group of memory instructions.
  InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF);

  /// The cost computation for Gather/Scatter instruction.
  InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF);

  /// The cost computation for widening instruction \p I with consecutive
  /// memory access.
  InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF);

  /// The cost calculation for Load/Store instruction \p I with uniform pointer -
  /// Load: scalar load + broadcast.
  /// Store: scalar store + (loop invariant value stored? 0 : extract of last
  /// element)
  InstructionCost getUniformMemOpCost(Instruction *I, ElementCount VF);

  /// Estimate the overhead of scalarizing an instruction. This is a
  /// convenience wrapper for the type-based getScalarizationOverhead API.
  InstructionCost getScalarizationOverhead(Instruction *I,
                                           ElementCount VF) const;

  /// Returns true if an artificially high cost for emulated masked memrefs
  /// should be used.
  bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF);

  /// Map of scalar integer values to the smallest bitwidth they can be legally
  /// represented as. The vector equivalents of these values should be truncated
  /// to this type.
  MapVector<Instruction *, uint64_t> MinBWs;
1561
  /// A type representing the costs for instructions if they were to be
  /// scalarized rather than vectorized. The entries are Instruction-Cost
  /// pairs.
  using ScalarCostsTy = DenseMap<Instruction *, InstructionCost>;

  /// A set containing all BasicBlocks that are known to be present after
  /// vectorization as a predicated block.
  DenseMap<ElementCount, SmallPtrSet<BasicBlock *, 4>>
      PredicatedBBsAfterVectorization;

  /// Records whether it is allowed to have the original scalar loop execute at
  /// least once. This may be needed as a fallback loop in case runtime
  /// aliasing/dependence checks fail, or to handle the tail/remainder
  /// iterations when the trip count is unknown or doesn't divide by the VF,
  /// or as a peel-loop to handle gaps in interleave-groups.
  /// Under optsize and when the trip count is very small we don't allow any
  /// iterations to execute in the scalar loop.
  ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;

  /// Control finally chosen tail folding style. The first element is used if
  /// the IV update may overflow, the second element - if it does not.
  std::optional<std::pair<TailFoldingStyle, TailFoldingStyle>>
      ChosenTailFoldingStyle;

  /// true if scalable vectorization is supported and enabled.
  std::optional<bool> IsScalableVectorizationAllowed;

  /// Maximum safe number of elements to be processed per vector iteration,
  /// which do not prevent store-load forwarding and are safe with regard to the
  /// memory dependencies. Required for EVL-based vectorization, where this
  /// value is used as the upper bound of the safe AVL.
  std::optional<unsigned> MaxSafeElements;

  /// A map holding scalar costs for different vectorization factors. The
  /// presence of a cost for an instruction in the mapping indicates that the
  /// instruction will be scalarized when vectorizing with the associated
  /// vectorization factor. The entries are VF-ScalarCostTy pairs.
  DenseMap<ElementCount, ScalarCostsTy> InstsToScalarize;

  /// Holds the instructions known to be uniform after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Uniforms;

  /// Holds the instructions known to be scalar after vectorization.
  /// The data is collected per VF.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> Scalars;

  /// Holds the instructions (address computations) that are forced to be
  /// scalarized.
  DenseMap<ElementCount, SmallPtrSet<Instruction *, 4>> ForcedScalars;

  /// PHINodes of the reductions that should be expanded in-loop.
  SmallPtrSet<PHINode *, 4> InLoopReductions;

  /// A map of inloop reduction operations and their immediate chain operand.
  /// FIXME: This can be removed once reductions can be costed correctly in
  /// VPlan. This was added to allow quick lookup of the inloop operations.
  DenseMap<Instruction *, Instruction *> InLoopReductionImmediateChains;

  /// Returns the expected difference in cost from scalarizing the expression
  /// feeding a predicated instruction \p PredInst. The instructions to
  /// scalarize and their scalar costs are collected in \p ScalarCosts. A
  /// non-negative return value implies the expression will be scalarized.
  /// Currently, only single-use chains are considered for scalarization.
  InstructionCost computePredInstDiscount(Instruction *PredInst,
                                          ScalarCostsTy &ScalarCosts,
                                          ElementCount VF);

  /// Collect the instructions that are uniform after vectorization. An
  /// instruction is uniform if we represent it with a single scalar value in
  /// the vectorized loop corresponding to each vector iteration. Examples of
  /// uniform instructions include pointer operands of consecutive or
  /// interleaved memory accesses. Note that although uniformity implies an
  /// instruction will be scalar, the reverse is not true. In general, a
  /// scalarized instruction will be represented by VF scalar values in the
  /// vectorized loop, each corresponding to an iteration of the original
  /// scalar loop.
  void collectLoopUniforms(ElementCount VF);

  /// Collect the instructions that are scalar after vectorization. An
  /// instruction is scalar if it is known to be uniform or will be scalarized
  /// during vectorization. collectLoopScalars should only add non-uniform nodes
  /// to the list if they are used by a load/store instruction that is marked as
  /// CM_Scalarize. Non-uniform scalarized instructions will be represented by
  /// VF values in the vectorized loop, each corresponding to an iteration of
  /// the original scalar loop.
  void collectLoopScalars(ElementCount VF);

  /// Keeps cost model vectorization decision and cost for instructions.
  /// Right now it is used for memory instructions only.
  using DecisionList = DenseMap<std::pair<Instruction *, ElementCount>,
                                std::pair<InstWidening, InstructionCost>>;

  DecisionList WideningDecisions;

  using CallDecisionList =
      DenseMap<std::pair<CallInst *, ElementCount>, CallWideningDecision>;

  CallDecisionList CallWideningDecisions;
1661
1662 /// Returns true if \p V is expected to be vectorized and it needs to be
1663 /// extracted.
1664 bool needsExtract(Value *V, ElementCount VF) const {
1665 Instruction *I = dyn_cast<Instruction>(Val: V);
1666 if (VF.isScalar() || !I || !TheLoop->contains(Inst: I) ||
1667 TheLoop->isLoopInvariant(V: I) ||
1668 getWideningDecision(I, VF) == CM_Scalarize)
1669 return false;
1670
1671 // Assume we can vectorize V (and hence we need extraction) if the
1672 // scalars are not computed yet. This can happen, because it is called
1673 // via getScalarizationOverhead from setCostBasedWideningDecision, before
1674 // the scalars are collected. That should be a safe assumption in most
1675 // cases, because we check if the operands have vectorizable types
1676 // beforehand in LoopVectorizationLegality.
1677 return !Scalars.contains(Val: VF) || !isScalarAfterVectorization(I, VF);
1678 };
1679
1680 /// Returns a range containing only operands needing to be extracted.
1681 SmallVector<Value *, 4> filterExtractingOperands(Instruction::op_range Ops,
1682 ElementCount VF) const {
1683 return SmallVector<Value *, 4>(make_filter_range(
1684 Range&: Ops, Pred: [this, VF](Value *V) { return this->needsExtract(V, VF); }));
1685 }
1686
public:
  /// The loop that we evaluate.
  Loop *TheLoop;

  /// Predicated scalar evolution analysis.
  PredicatedScalarEvolution &PSE;

  /// Loop Info analysis.
  LoopInfo *LI;

  /// Vectorization legality.
  LoopVectorizationLegality *Legal;

  /// Vector target information.
  const TargetTransformInfo &TTI;

  /// Target Library Info.
  const TargetLibraryInfo *TLI;

  /// Demanded bits analysis.
  DemandedBits *DB;

  /// Assumption cache.
  AssumptionCache *AC;

  /// Interface to emit optimization remarks.
  OptimizationRemarkEmitter *ORE;

  /// The function containing \p TheLoop.
  const Function *TheFunction;

  /// Loop Vectorize Hint.
  const LoopVectorizeHints *Hints;

  /// The interleave access information contains groups of interleaved accesses
  /// with the same stride and close to each other.
  InterleavedAccessInfo &InterleaveInfo;

  /// Values to ignore in the cost model.
  SmallPtrSet<const Value *, 16> ValuesToIgnore;

  /// Values to ignore in the cost model when VF > 1.
  SmallPtrSet<const Value *, 16> VecValuesToIgnore;

  /// All element types found in the loop.
  SmallPtrSet<Type *, 16> ElementTypesInLoop;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

  /// Whether this loop should be optimized for size based on function attribute
  /// or profile information.
  bool OptForSize;
1739};
1740} // end namespace llvm
1741
1742namespace {
1743/// Helper struct to manage generating runtime checks for vectorization.
1744///
1745/// The runtime checks are created up-front in temporary blocks to allow better
1746/// estimating the cost and un-linked from the existing IR. After deciding to
1747/// vectorize, the checks are moved back. If deciding not to vectorize, the
1748/// temporary blocks are completely removed.
class GeneratedRTChecks {
  /// Basic block which contains the generated SCEV checks, if any.
  BasicBlock *SCEVCheckBlock = nullptr;

  /// The value representing the result of the generated SCEV checks. If it is
  /// nullptr no SCEV checks have been generated.
  Value *SCEVCheckCond = nullptr;

  /// Basic block which contains the generated memory runtime checks, if any.
  BasicBlock *MemCheckBlock = nullptr;

  /// The value representing the result of the generated memory runtime checks.
  /// If it is nullptr no memory runtime checks have been generated.
  Value *MemRuntimeCheckCond = nullptr;

  /// Analyses that create() keeps up to date while the temporary check blocks
  /// exist, and from which those blocks are removed again before returning.
  DominatorTree *DT;
  LoopInfo *LI;
  TargetTransformInfo *TTI;

  /// Separate expanders for the SCEV predicate and the memory checks, so that
  /// each set of expanded instructions can be cleaned up independently in the
  /// destructor.
  SCEVExpander SCEVExp;
  SCEVExpander MemCheckExp;

  /// Set by create() when the number of runtime pointer checks exceeds
  /// VectorizeMemoryCheckThreshold; makes getCost() return an invalid cost.
  bool CostTooHigh = false;

  /// The loop enclosing the vectorized loop, if any; set by create() and used
  /// by getCost() to discount hoistable memory checks.
  Loop *OuterLoop = nullptr;

  PredicatedScalarEvolution &PSE;

  /// The kind of cost that we are calculating.
  TTI::TargetCostKind CostKind;

public:
  GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
                    LoopInfo *LI, TargetTransformInfo *TTI,
                    const DataLayout &DL, TTI::TargetCostKind CostKind)
      : DT(DT), LI(LI), TTI(TTI), SCEVExp(*PSE.getSE(), DL, "scev.check"),
        MemCheckExp(*PSE.getSE(), DL, "scev.check"), PSE(PSE),
        CostKind(CostKind) {}

  /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
  /// accurately estimate the cost of the runtime checks. The blocks are
  /// un-linked from the IR and are added back during vector code generation. If
  /// there is no vector code generation, the check blocks are removed
  /// completely.
  void create(Loop *L, const LoopAccessInfo &LAI,
              const SCEVPredicate &UnionPred, ElementCount VF, unsigned IC) {

    // Hard cutoff to limit compile-time increase in case a very large number of
    // runtime checks needs to be generated.
    // TODO: Skip cutoff if the loop is guaranteed to execute, e.g. due to
    // profile info.
    CostTooHigh =
        LAI.getNumRuntimePointerChecks() > VectorizeMemoryCheckThreshold;
    if (CostTooHigh)
      return;

    BasicBlock *LoopHeader = L->getHeader();
    BasicBlock *Preheader = L->getLoopPreheader();

    // Use SplitBlock to create blocks for SCEV & memory runtime checks to
    // ensure the blocks are properly added to LoopInfo & DominatorTree. Those
    // may be used by SCEVExpander. The blocks will be un-linked from their
    // predecessors and removed from LI & DT at the end of the function.
    if (!UnionPred.isAlwaysTrue()) {
      SCEVCheckBlock = SplitBlock(Old: Preheader, SplitPt: Preheader->getTerminator(), DT, LI,
                                  MSSAU: nullptr, BBName: "vector.scevcheck");

      SCEVCheckCond = SCEVExp.expandCodeForPredicate(
          Pred: &UnionPred, Loc: SCEVCheckBlock->getTerminator());
      if (isa<Constant>(Val: SCEVCheckCond)) {
        // Clean up directly after expanding the predicate to a constant, to
        // avoid further expansions re-using anything left over from SCEVExp.
        SCEVExpanderCleaner SCEVCleaner(SCEVExp);
        SCEVCleaner.cleanup();
      }
    }

    const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
    if (RtPtrChecking.Need) {
      // Chain the memory-check block after the SCEV-check block, if one was
      // created above; otherwise split it off the preheader directly.
      auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
      MemCheckBlock = SplitBlock(Old: Pred, SplitPt: Pred->getTerminator(), DT, LI, MSSAU: nullptr,
                                 BBName: "vector.memcheck");

      auto DiffChecks = RtPtrChecking.getDiffChecks();
      if (DiffChecks) {
        Value *RuntimeVF = nullptr;
        MemRuntimeCheckCond = addDiffRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), Checks: *DiffChecks, Expander&: MemCheckExp,
            GetVF: [VF, &RuntimeVF](IRBuilderBase &B, unsigned Bits) {
              // Materialize the runtime VF only once and reuse it for all
              // checks.
              if (!RuntimeVF)
                RuntimeVF = getRuntimeVF(B, Ty: B.getIntNTy(N: Bits), VF);
              return RuntimeVF;
            },
            IC);
      } else {
        MemRuntimeCheckCond = addRuntimeChecks(
            Loc: MemCheckBlock->getTerminator(), TheLoop: L, PointerChecks: RtPtrChecking.getChecks(),
            Expander&: MemCheckExp, HoistRuntimeChecks: VectorizerParams::HoistRuntimeChecks);
      }
      assert(MemRuntimeCheckCond &&
             "no RT checks generated although RtPtrChecking "
             "claimed checks are required");
    }

    if (!MemCheckBlock && !SCEVCheckBlock)
      return;

    // Unhook the temporary block with the checks, update various places
    // accordingly.
    if (SCEVCheckBlock)
      SCEVCheckBlock->replaceAllUsesWith(V: Preheader);
    if (MemCheckBlock)
      MemCheckBlock->replaceAllUsesWith(V: Preheader);

    // Move each check block's terminator into the preheader (replacing the
    // preheader's branch) and cap the now-unlinked check block with a
    // placeholder unreachable.
    if (SCEVCheckBlock) {
      SCEVCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }
    if (MemCheckBlock) {
      MemCheckBlock->getTerminator()->moveBefore(
          InsertPos: Preheader->getTerminator()->getIterator());
      auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock);
      UI->setDebugLoc(DebugLoc::getTemporary());
      Preheader->getTerminator()->eraseFromParent();
    }

    // With the check blocks unhooked, the preheader again dominates the loop
    // header directly; drop the detached blocks from DT and LI.
    DT->changeImmediateDominator(BB: LoopHeader, NewBB: Preheader);
    if (MemCheckBlock) {
      DT->eraseNode(BB: MemCheckBlock);
      LI->removeBlock(BB: MemCheckBlock);
    }
    if (SCEVCheckBlock) {
      DT->eraseNode(BB: SCEVCheckBlock);
      LI->removeBlock(BB: SCEVCheckBlock);
    }

    // Outer loop is used as part of the later cost calculations.
    OuterLoop = L->getParentLoop();
  }

  InstructionCost getCost() {
    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");

    if (CostTooHigh) {
      InstructionCost Cost;
      Cost.setInvalid();
      LLVM_DEBUG(dbgs() << "  number of checks exceeded threshold\n");
      return Cost;
    }

    InstructionCost RTCheckCost = 0;
    // Sum the cost of every check instruction; terminators are skipped since
    // they are replaced when the blocks are hooked back into the IR.
    if (SCEVCheckBlock)
      for (Instruction &I : *SCEVCheckBlock) {
        if (SCEVCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        RTCheckCost += C;
      }
    if (MemCheckBlock) {
      InstructionCost MemCheckCost = 0;
      for (Instruction &I : *MemCheckBlock) {
        if (MemCheckBlock->getTerminator() == &I)
          continue;
        InstructionCost C = TTI->getInstructionCost(U: &I, CostKind);
        LLVM_DEBUG(dbgs() << "  " << C << "  for " << I << "\n");
        MemCheckCost += C;
      }

      // If the runtime memory checks are being created inside an outer loop
      // we should find out if these checks are outer loop invariant. If so,
      // the checks will likely be hoisted out and so the effective cost will
      // reduce according to the outer loop trip count.
      if (OuterLoop) {
        ScalarEvolution *SE = MemCheckExp.getSE();
        // TODO: If profitable, we could refine this further by analysing every
        // individual memory check, since there could be a mixture of loop
        // variant and invariant checks that mean the final condition is
        // variant.
        const SCEV *Cond = SE->getSCEV(V: MemRuntimeCheckCond);
        if (SE->isLoopInvariant(S: Cond, L: OuterLoop)) {
          // It seems reasonable to assume that we can reduce the effective
          // cost of the checks even when we know nothing about the trip
          // count. Assume that the outer loop executes at least twice.
          unsigned BestTripCount = 2;

          // Get the best known TC estimate.
          if (auto EstimatedTC = getSmallBestKnownTC(
                  PSE, L: OuterLoop, /* CanUseConstantMax = */ false))
            if (EstimatedTC->isFixed())
              BestTripCount = EstimatedTC->getFixedValue();

          InstructionCost NewMemCheckCost = MemCheckCost / BestTripCount;

          // Let's ensure the cost is always at least 1.
          NewMemCheckCost = std::max(a: NewMemCheckCost.getValue(),
                                     b: (InstructionCost::CostType)1);

          if (BestTripCount > 1)
            LLVM_DEBUG(dbgs()
                       << "We expect runtime memory checks to be hoisted "
                       << "out of the outer loop. Cost reduced from "
                       << MemCheckCost << " to " << NewMemCheckCost << '\n');

          MemCheckCost = NewMemCheckCost;
        }
      }

      RTCheckCost += MemCheckCost;
    }

    if (SCEVCheckBlock || MemCheckBlock)
      LLVM_DEBUG(dbgs() << "Total cost of runtime checks: " << RTCheckCost
                        << "\n");

    return RTCheckCost;
  }

  /// Remove the created SCEV & memory runtime check blocks & instructions, if
  /// unused.
  ~GeneratedRTChecks() {
    SCEVExpanderCleaner SCEVCleaner(SCEVExp);
    SCEVExpanderCleaner MemCheckCleaner(MemCheckExp);
    // A check block counts as used if some predecessor still branches to it,
    // i.e. it was hooked back into the IR by code generation.
    bool SCEVChecksUsed = !SCEVCheckBlock || !pred_empty(BB: SCEVCheckBlock);
    bool MemChecksUsed = !MemCheckBlock || !pred_empty(BB: MemCheckBlock);
    if (SCEVChecksUsed)
      SCEVCleaner.markResultUsed();

    if (MemChecksUsed) {
      MemCheckCleaner.markResultUsed();
    } else {
      auto &SE = *MemCheckExp.getSE();
      // Memory runtime check generation creates compares that use expanded
      // values. Remove them before running the SCEVExpanderCleaners.
      for (auto &I : make_early_inc_range(Range: reverse(C&: *MemCheckBlock))) {
        if (MemCheckExp.isInsertedInstruction(I: &I))
          continue;
        SE.forgetValue(V: &I);
        I.eraseFromParent();
      }
    }
    MemCheckCleaner.cleanup();
    SCEVCleaner.cleanup();

    if (!SCEVChecksUsed)
      SCEVCheckBlock->eraseFromParent();
    if (!MemChecksUsed)
      MemCheckBlock->eraseFromParent();
  }

  /// Retrieves the SCEVCheckCond and SCEVCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getSCEVChecks() {
    using namespace llvm::PatternMatch;
    // A constant-false condition means the checks always pass; report none.
    if (!SCEVCheckCond || match(V: SCEVCheckCond, P: m_ZeroInt()))
      return {nullptr, nullptr};

    return {SCEVCheckCond, SCEVCheckBlock};
  }

  /// Retrieves the MemCheckCond and MemCheckBlock that were generated as IR
  /// outside VPlan.
  std::pair<Value *, BasicBlock *> getMemRuntimeChecks() {
    return {MemRuntimeCheckCond, MemCheckBlock};
  }

  /// Return true if any runtime checks have been added
  bool hasChecks() const {
    using namespace llvm::PatternMatch;
    return (SCEVCheckCond && !match(V: SCEVCheckCond, P: m_ZeroInt())) ||
           MemRuntimeCheckCond;
  }
};
2026} // namespace
2027
2028static bool useActiveLaneMask(TailFoldingStyle Style) {
2029 return Style == TailFoldingStyle::Data ||
2030 Style == TailFoldingStyle::DataAndControlFlow ||
2031 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2032}
2033
2034static bool useActiveLaneMaskForControlFlow(TailFoldingStyle Style) {
2035 return Style == TailFoldingStyle::DataAndControlFlow ||
2036 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
2037}
2038
2039// Return true if \p OuterLp is an outer loop annotated with hints for explicit
2040// vectorization. The loop needs to be annotated with #pragma omp simd
2041// simdlen(#) or #pragma clang vectorize(enable) vectorize_width(#). If the
2042// vector length information is not provided, vectorization is not considered
2043// explicit. Interleave hints are not allowed either. These limitations will be
2044// relaxed in the future.
2045// Please, note that we are currently forced to abuse the pragma 'clang
2046// vectorize' semantics. This pragma provides *auto-vectorization hints*
2047// (i.e., LV must check that vectorization is legal) whereas pragma 'omp simd'
2048// provides *explicit vectorization hints* (LV can bypass legal checks and
2049// assume that vectorization is legal). However, both hints are implemented
2050// using the same metadata (llvm.loop.vectorize, processed by
2051// LoopVectorizeHints). This will be fixed in the future when the native IR
2052// representation for pragma 'omp simd' is introduced.
2053static bool isExplicitVecOuterLoop(Loop *OuterLp,
2054 OptimizationRemarkEmitter *ORE) {
2055 assert(!OuterLp->isInnermost() && "This is not an outer loop");
2056 LoopVectorizeHints Hints(OuterLp, true /*DisableInterleaving*/, *ORE);
2057
2058 // Only outer loops with an explicit vectorization hint are supported.
2059 // Unannotated outer loops are ignored.
2060 if (Hints.getForce() == LoopVectorizeHints::FK_Undefined)
2061 return false;
2062
2063 Function *Fn = OuterLp->getHeader()->getParent();
2064 if (!Hints.allowVectorization(F: Fn, L: OuterLp,
2065 VectorizeOnlyWhenForced: true /*VectorizeOnlyWhenForced*/)) {
2066 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent outer loop vectorization.\n");
2067 return false;
2068 }
2069
2070 if (Hints.getInterleave() > 1) {
2071 // TODO: Interleave support is future work.
2072 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Interleave is not supported for "
2073 "outer loops.\n");
2074 Hints.emitRemarkWithHints();
2075 return false;
2076 }
2077
2078 return true;
2079}
2080
2081static void collectSupportedLoops(Loop &L, LoopInfo *LI,
2082 OptimizationRemarkEmitter *ORE,
2083 SmallVectorImpl<Loop *> &V) {
2084 // Collect inner loops and outer loops without irreducible control flow. For
2085 // now, only collect outer loops that have explicit vectorization hints. If we
2086 // are stress testing the VPlan H-CFG construction, we collect the outermost
2087 // loop of every loop nest.
2088 if (L.isInnermost() || VPlanBuildStressTest ||
2089 (EnableVPlanNativePath && isExplicitVecOuterLoop(OuterLp: &L, ORE))) {
2090 LoopBlocksRPO RPOT(&L);
2091 RPOT.perform(LI);
2092 if (!containsIrreducibleCFG<const BasicBlock *>(RPOTraversal&: RPOT, LI: *LI)) {
2093 V.push_back(Elt: &L);
2094 // TODO: Collect inner loops inside marked outer loops in case
2095 // vectorization fails for the outer loop. Do not invoke
2096 // 'containsIrreducibleCFG' again for inner loops when the outer loop is
2097 // already known to be reducible. We can use an inherited attribute for
2098 // that.
2099 return;
2100 }
2101 }
2102 for (Loop *InnerL : L)
2103 collectSupportedLoops(L&: *InnerL, LI, ORE, V);
2104}
2105
2106//===----------------------------------------------------------------------===//
2107// Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
2108// LoopVectorizationCostModel and LoopVectorizationPlanner.
2109//===----------------------------------------------------------------------===//
2110
/// Compute the transformed value of Index at offset StartValue using step
/// StepValue.
/// For integer induction, returns StartValue + Index * StepValue.
/// For pointer induction, returns StartValue[Index * StepValue].
/// FIXME: The newly created binary instructions should contain nsw/nuw
/// flags, which can be found from the original scalar operations.
static Value *
emitTransformedIndex(IRBuilderBase &B, Value *Index, Value *StartValue,
                     Value *Step,
                     InductionDescriptor::InductionKind InductionKind,
                     const BinaryOperator *InductionBinOp) {
  using namespace llvm::PatternMatch;
  // Bring Index to the step's type: sext/trunc for integer steps, signed
  // int-to-FP conversion for FP steps.
  Type *StepTy = Step->getType();
  Value *CastedIndex = StepTy->isIntegerTy()
                           ? B.CreateSExtOrTrunc(V: Index, DestTy: StepTy)
                           : B.CreateCast(Op: Instruction::SIToFP, V: Index, DestTy: StepTy);
  if (CastedIndex != Index) {
    CastedIndex->setName(CastedIndex->getName() + ".cast");
    Index = CastedIndex;
  }

  // Note: the IR at this point is broken. We cannot use SE to create any new
  // SCEV and then expand it, hoping that SCEV's simplification will give us
  // a more optimal code. Unfortunately, attempt of doing so on invalid IR may
  // lead to various SCEV crashes. So all we can do is to use builder and rely
  // on InstCombine for future simplifications. Here we handle some trivial
  // cases only.
  // Add with trivial folding: X + 0 -> X, 0 + Y -> Y.
  auto CreateAdd = [&B](Value *X, Value *Y) {
    assert(X->getType() == Y->getType() && "Types don't match!");
    if (match(V: X, P: m_ZeroInt()))
      return Y;
    if (match(V: Y, P: m_ZeroInt()))
      return X;
    return B.CreateAdd(LHS: X, RHS: Y);
  };

  // We allow X to be a vector type, in which case Y will potentially be
  // splatted into a vector with the same element count.
  // Mul with trivial folding: X * 1 -> X, 1 * Y -> Y.
  auto CreateMul = [&B](Value *X, Value *Y) {
    assert(X->getType()->getScalarType() == Y->getType() &&
           "Types don't match!");
    if (match(V: X, P: m_One()))
      return Y;
    if (match(V: Y, P: m_One()))
      return X;
    VectorType *XVTy = dyn_cast<VectorType>(Val: X->getType());
    if (XVTy && !isa<VectorType>(Val: Y->getType()))
      Y = B.CreateVectorSplat(EC: XVTy->getElementCount(), V: Y);
    return B.CreateMul(LHS: X, RHS: Y);
  };

  switch (InductionKind) {
  case InductionDescriptor::IK_IntInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for integer inductions yet");
    assert(Index->getType() == StartValue->getType() &&
           "Index type does not match StartValue type");
    // A step of -1 folds StartValue + Index * -1 into StartValue - Index.
    if (isa<ConstantInt>(Val: Step) && cast<ConstantInt>(Val: Step)->isMinusOne())
      return B.CreateSub(LHS: StartValue, RHS: Index);
    auto *Offset = CreateMul(Index, Step);
    return CreateAdd(StartValue, Offset);
  }
  case InductionDescriptor::IK_PtrInduction:
    // Pointer induction: StartValue advanced by Index * Step bytes/elements.
    return B.CreatePtrAdd(Ptr: StartValue, Offset: CreateMul(Index, Step));
  case InductionDescriptor::IK_FpInduction: {
    assert(!isa<VectorType>(Index->getType()) &&
           "Vector indices not supported for FP inductions yet");
    assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
    assert(InductionBinOp &&
           (InductionBinOp->getOpcode() == Instruction::FAdd ||
            InductionBinOp->getOpcode() == Instruction::FSub) &&
           "Original bin op should be defined for FP induction");

    // Reuse the original FAdd/FSub opcode so the transformed index combines
    // with StartValue the same way the scalar induction did.
    Value *MulExp = B.CreateFMul(L: Step, R: Index);
    return B.CreateBinOp(Opc: InductionBinOp->getOpcode(), LHS: StartValue, RHS: MulExp,
                         Name: "induction");
  }
  case InductionDescriptor::IK_NoInduction:
    return nullptr;
  }
  llvm_unreachable("invalid enum");
}
2193
2194static std::optional<unsigned> getMaxVScale(const Function &F,
2195 const TargetTransformInfo &TTI) {
2196 if (std::optional<unsigned> MaxVScale = TTI.getMaxVScale())
2197 return MaxVScale;
2198
2199 if (F.hasFnAttribute(Kind: Attribute::VScaleRange))
2200 return F.getFnAttribute(Kind: Attribute::VScaleRange).getVScaleRangeMax();
2201
2202 return std::nullopt;
2203}
2204
/// For the given VF and UF and maximum trip count computed for the loop,
/// return true iff the induction variable of the vectorized loop is known not
/// to overflow, i.e. a runtime overflow check would always evaluate to false
/// and can be removed. Returns false (conservatively) whenever this cannot be
/// proven, e.g. when the maximum trip count is unknown.
static bool isIndvarOverflowCheckKnownFalse(
    const LoopVectorizationCostModel *Cost,
    ElementCount VF, std::optional<unsigned> UF = std::nullopt) {
  // Always be conservative if we don't know the exact unroll factor.
  unsigned MaxUF = UF ? *UF : Cost->TTI.getMaxInterleaveFactor(VF);

  IntegerType *IdxTy = Cost->Legal->getWidestInductionType();
  APInt MaxUIntTripCount = IdxTy->getMask();

  // We know the runtime overflow check is known false iff the (max) trip-count
  // is known and (max) trip-count + (VF * UF) does not overflow in the type of
  // the vector loop induction variable.
  if (unsigned TC = Cost->PSE.getSmallConstantMaxTripCount()) {
    uint64_t MaxVF = VF.getKnownMinValue();
    if (VF.isScalable()) {
      // For scalable vectors the true step is VF * vscale; without an upper
      // bound on vscale the step cannot be bounded, so stay conservative.
      std::optional<unsigned> MaxVScale =
          getMaxVScale(F: *Cost->TheFunction, TTI: Cost->TTI);
      if (!MaxVScale)
        return false;
      MaxVF *= *MaxVScale;
    }

    // No overflow iff TC + MaxVF * MaxUF fits the induction type, i.e.
    // (UINT_MAX - TC) > MaxVF * MaxUF.
    return (MaxUIntTripCount - TC).ugt(RHS: MaxVF * MaxUF);
  }

  return false;
}
2236
2237// Return whether we allow using masked interleave-groups (for dealing with
2238// strided loads/stores that reside in predicated blocks, or for dealing
2239// with gaps).
2240static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
2241 // If an override option has been passed in for interleaved accesses, use it.
2242 if (EnableMaskedInterleavedMemAccesses.getNumOccurrences() > 0)
2243 return EnableMaskedInterleavedMemAccesses;
2244
2245 return TTI.enableMaskedInterleavedAccessVectorization();
2246}
2247
/// Compute -- and cache in VectorTripCount -- the number of iterations the
/// vector loop will execute: the trip count rounded down to a multiple of
/// VF * UF (after first rounding it up when the tail is folded by masking).
/// The computation is emitted at the end of \p InsertBlock.
Value *
InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) {
  // Reuse the cached value if it was already computed.
  if (VectorTripCount)
    return VectorTripCount;

  Value *TC = getTripCount();
  IRBuilder<> Builder(InsertBlock->getTerminator());

  Type *Ty = TC->getType();
  // This is where we can make the step a runtime constant.
  Value *Step = createStepForVF(B&: Builder, Ty, VF, Step: UF);

  // If the tail is to be folded by masking, round the number of iterations N
  // up to a multiple of Step instead of rounding down. This is done by first
  // adding Step-1 and then rounding down. Note that it's ok if this addition
  // overflows: the vector induction variable will eventually wrap to zero given
  // that it starts at zero and its Step is a power of two; the loop will then
  // exit, with the last early-exit vector comparison also producing all-true.
  // For scalable vectors the VF is not guaranteed to be a power of 2, but this
  // is accounted for in emitIterationCountCheck that adds an overflow check.
  if (Cost->foldTailByMasking()) {
    assert(isPowerOf2_32(VF.getKnownMinValue() * UF) &&
           "VF*UF must be a power of 2 when folding tail by masking");
    TC = Builder.CreateAdd(LHS: TC, RHS: Builder.CreateSub(LHS: Step, RHS: ConstantInt::get(Ty, V: 1)),
                           Name: "n.rnd.up");
  }

  // Now we need to generate the expression for the part of the loop that the
  // vectorized body will execute. This is equal to N - (N % Step) if scalar
  // iterations are not required for correctness, or N - Step, otherwise. Step
  // is equal to the vectorization factor (number of SIMD elements) times the
  // unroll factor (number of SIMD instructions).
  Value *R = Builder.CreateURem(LHS: TC, RHS: Step, Name: "n.mod.vf");

  // There are cases where we *must* run at least one iteration in the remainder
  // loop. See the cost model for when this can happen. If the step evenly
  // divides the trip count, we set the remainder to be equal to the step. If
  // the step does not evenly divide the trip count, no adjustment is necessary
  // since there will already be scalar iterations. Note that the minimum
  // iterations check ensures that N >= Step.
  if (Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector())) {
    auto *IsZero = Builder.CreateICmpEQ(LHS: R, RHS: ConstantInt::get(Ty: R->getType(), V: 0));
    R = Builder.CreateSelect(C: IsZero, True: Step, False: R);
  }

  VectorTripCount = Builder.CreateSub(LHS: TC, RHS: R, Name: "n.vec");

  return VectorTripCount;
}
2297
/// Wire the runtime-check block \p CheckIRBB into the VPlan: it is wrapped in
/// a VPIRBasicBlock, inserted on the edge leading to the vector preheader, and
/// additionally connected to the scalar preheader.
void InnerLoopVectorizer::introduceCheckBlockInVPlan(BasicBlock *CheckIRBB) {
  // Note: The block with the minimum trip-count check is already connected
  // during earlier VPlan construction.
  VPBlockBase *ScalarPH = Plan.getScalarPreheader();
  VPBlockBase *PreVectorPH = VectorPHVPB->getSinglePredecessor();
  assert(PreVectorPH->getNumSuccessors() == 2 && "Expected 2 successors");
  assert(PreVectorPH->getSuccessors()[0] == ScalarPH && "Unexpected successor");
  VPIRBasicBlock *CheckVPIRBB = Plan.createVPIRBasicBlock(IRBB: CheckIRBB);
  VPBlockUtils::insertOnEdge(From: PreVectorPH, To: VectorPHVPB, BlockPtr: CheckVPIRBB);
  PreVectorPH = CheckVPIRBB;
  VPBlockUtils::connectBlocks(From: PreVectorPH, To: ScalarPH);
  // Swap so the scalar preheader is successor 0, matching the invariant
  // asserted for the original predecessor above.
  PreVectorPH->swapSuccessors();

  // We just connected a new block to the scalar preheader. Update all
  // VPPhis by adding an incoming value for it, replicating the last value.
  unsigned NumPredecessors = ScalarPH->getNumPredecessors();
  for (VPRecipeBase &R : cast<VPBasicBlock>(Val: ScalarPH)->phis()) {
    assert(isa<VPPhi>(&R) && "Phi expected to be VPPhi");
    assert(cast<VPPhi>(&R)->getNumIncoming() == NumPredecessors - 1 &&
           "must have incoming values for all operands");
    R.addOperand(Operand: R.getOperand(N: NumPredecessors - 2));
  }
}
2321
/// Create and return the i1 value of the minimum-iteration check: true when
/// the vector loop must be bypassed because the trip count is too small for a
/// single vector iteration or -- for scalable VFs with tail folding -- when
/// the induction variable could overflow. May fold to a constant when
/// ScalarEvolution can decide the comparison at compile time.
Value *InnerLoopVectorizer::createIterationCountCheck(ElementCount VF,
                                                      unsigned UF) const {
  // Generate code to check if the loop's trip count is less than VF * UF, or
  // equal to it in case a scalar epilogue is required; this implies that the
  // vector trip count is zero. This check also covers the case where adding one
  // to the backedge-taken count overflowed leading to an incorrect trip count
  // of zero. In this case we will also jump to the scalar loop.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: VF.isVector()) ? ICmpInst::ICMP_ULE
                                                     : ICmpInst::ICMP_ULT;

  // Reuse existing vector loop preheader for TC checks.
  // Note that new preheader block is generated for vector loop.
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  IRBuilder<> Builder(TCCheckBlock->getTerminator());

  // If tail is to be folded, vector loop takes care of all iterations.
  Value *Count = getTripCount();
  Type *CountTy = Count->getType();
  Value *CheckMinIters = Builder.getFalse();
  auto CreateStep = [&]() -> Value * {
    // Create step with max(MinProTripCount, UF * VF).
    if (UF * VF.getKnownMinValue() >= MinProfitableTripCount.getKnownMinValue())
      return createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF);

    Value *MinProfTC =
        createStepForVF(B&: Builder, Ty: CountTy, VF: MinProfitableTripCount, Step: 1);
    if (!VF.isScalable())
      return MinProfTC;
    // For scalable VFs the larger of the two bounds is only known at runtime.
    return Builder.CreateBinaryIntrinsic(
        ID: Intrinsic::umax, LHS: MinProfTC, RHS: createStepForVF(B&: Builder, Ty: CountTy, VF, Step: UF));
  };

  TailFoldingStyle Style = Cost->getTailFoldingStyle();
  if (Style == TailFoldingStyle::None) {
    Value *Step = CreateStep();
    ScalarEvolution &SE = *PSE.getSE();
    // TODO: Emit unconditional branch to vector preheader instead of
    // conditional branch with known condition.
    const SCEV *TripCountSCEV = SE.applyLoopGuards(Expr: SE.getSCEV(V: Count), L: OrigLoop);
    // Check if the trip count is < the step.
    if (SE.isKnownPredicate(Pred: P, LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // TODO: Ensure step is at most the trip count when determining max VF and
      // UF, w/o tail folding.
      CheckMinIters = Builder.getTrue();
    } else if (!SE.isKnownPredicate(Pred: CmpInst::getInversePredicate(pred: P),
                                    LHS: TripCountSCEV, RHS: SE.getSCEV(V: Step))) {
      // Generate the minimum iteration check only if we cannot prove the
      // check is known to be true, or known to be false.
      CheckMinIters = Builder.CreateICmp(P, LHS: Count, RHS: Step, Name: "min.iters.check");
    } // else step known to be < trip count, use CheckMinIters preset to false.
  } else if (VF.isScalable() && !TTI->isVScaleKnownToBeAPowerOfTwo() &&
             !isIndvarOverflowCheckKnownFalse(Cost, VF, UF) &&
             Style != TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck) {
    // vscale is not necessarily a power-of-2, which means we cannot guarantee
    // an overflow to zero when updating induction variables and so an
    // additional overflow check is required before entering the vector loop.

    // Get the maximum unsigned value for the type.
    Value *MaxUIntTripCount =
        ConstantInt::get(Ty: CountTy, V: cast<IntegerType>(Val: CountTy)->getMask());
    Value *LHS = Builder.CreateSub(LHS: MaxUIntTripCount, RHS: Count);

    // Don't execute the vector loop if (UMax - n) < (VF * UF).
    CheckMinIters = Builder.CreateICmp(P: ICmpInst::ICMP_ULT, LHS, RHS: CreateStep());
  }
  return CheckMinIters;
}
2389
/// Emit the minimum-iteration check and split off a new vector preheader; a
/// true check branches to \p Bypass (the scalar preheader) and a false one
/// continues to the new vector preheader.
void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) {
  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  Value *CheckMinIters = createIterationCountCheck(VF, UF);
  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                                  DT: static_cast<DominatorTree *>(nullptr), LI,
                                  MSSAU: nullptr, BBName: "vector.ph");

  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  // Attach branch weights only when the original loop carries profile data.
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  assert(cast<VPIRBasicBlock>(Plan.getEntry())->getIRBasicBlock() ==
             TCCheckBlock &&
         "Plan's entry must be TCCCheckBlock");
}
2408
/// Replace \p VPBB with a VPIRBasicBlock wrapping \p IRBB. All recipes from \p
/// VPBB are moved to the end of the newly created VPIRBasicBlock. VPBB must
/// have a single predecessor, which is rewired to the new VPIRBasicBlock. All
/// successors of VPBB, if any, are rewired to the new VPIRBasicBlock.
static void replaceVPBBWithIRVPBB(VPBasicBlock *VPBB, BasicBlock *IRBB) {
  VPIRBasicBlock *IRVPBB = VPBB->getPlan()->createVPIRBasicBlock(IRBB);
  for (auto &R : make_early_inc_range(Range&: *VPBB)) {
    // Moving in block order preserves the invariant that phi recipes always
    // precede non-phi recipes in the destination block.
    assert((IRVPBB->empty() || IRVPBB->back().isPhi() || !R.isPhi()) &&
           "Tried to move phi recipe after a non-phi recipe");
    R.moveBefore(BB&: *IRVPBB, I: IRVPBB->end());
  }

  // Rewire all edges of VPBB to the new VPIRBasicBlock.
  VPBlockUtils::reassociateBlocks(Old: VPBB, New: IRVPBB);
  // VPBB is now dead and will be cleaned up when the plan gets destroyed.
}
2424
/// Prepare the CFG for vectorization: reuse the original preheader as the
/// future vector preheader and split off a new scalar preheader (named with
/// \p Prefix) in front of the original loop.
void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
  LoopVectorPreHeader = OrigLoop->getLoopPreheader();
  assert(LoopVectorPreHeader && "Invalid loop structure");
  assert((OrigLoop->getUniqueLatchExitBlock() ||
          Cost->requiresScalarEpilogue(VF.isVector())) &&
         "loops not exiting via the latch without required epilogue?");

  LoopScalarPreHeader =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->getTerminator(), DT,
                 LI, MSSAU: nullptr, BBName: Twine(Prefix) + "scalar.ph");
  // NOTE: The Plan's scalar preheader VPBB isn't replaced with a VPIRBasicBlock
  // wrapping LoopScalarPreHeader here at the moment, because the Plan's scalar
  // preheader may be unreachable at this point. Instead it is replaced in
  // createVectorizedLoopSkeleton.
}
2440
2441/// Return the expanded step for \p ID using \p ExpandedSCEVs to look up SCEV
2442/// expansion results.
2443static Value *getExpandedStep(const InductionDescriptor &ID,
2444 const SCEV2ValueTy &ExpandedSCEVs) {
2445 const SCEV *Step = ID.getStep();
2446 if (auto *C = dyn_cast<SCEVConstant>(Val: Step))
2447 return C->getValue();
2448 if (auto *U = dyn_cast<SCEVUnknown>(Val: Step))
2449 return U->getValue();
2450 Value *V = ExpandedSCEVs.lookup(Val: Step);
2451 assert(V && "SCEV must be expanded at this point");
2452 return V;
2453}
2454
2455/// Knowing that loop \p L executes a single vector iteration, add instructions
2456/// that will get simplified and thus should not have any cost to \p
2457/// InstsToIgnore.
2458static void addFullyUnrolledInstructionsToIgnore(
2459 Loop *L, const LoopVectorizationLegality::InductionList &IL,
2460 SmallPtrSetImpl<Instruction *> &InstsToIgnore) {
2461 auto *Cmp = L->getLatchCmpInst();
2462 if (Cmp)
2463 InstsToIgnore.insert(Ptr: Cmp);
2464 for (const auto &KV : IL) {
2465 // Extract the key by hand so that it can be used in the lambda below. Note
2466 // that captured structured bindings are a C++20 extension.
2467 const PHINode *IV = KV.first;
2468
2469 // Get next iteration value of the induction variable.
2470 Instruction *IVInst =
2471 cast<Instruction>(Val: IV->getIncomingValueForBlock(BB: L->getLoopLatch()));
2472 if (all_of(Range: IVInst->users(),
2473 P: [&](const User *U) { return U == IV || U == Cmp; }))
2474 InstsToIgnore.insert(Ptr: IVInst);
2475 }
2476}
2477
/// Create the skeleton CFG for the vectorized loop and emit the
/// minimum-iteration check; returns the vector preheader.
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
  /*
   In this function we generate a new loop. The new loop will contain
   the vectorized instructions while the old loop will continue to run the
   scalar remainder.

       [ ] <-- old preheader - loop iteration number check and SCEVs in Plan's
     /  |      preheader are expanded here. Eventually all required SCEV
    /   |      expansion should happen here.
   /    v
  |    [ ] <-- vector loop bypass (may consist of multiple blocks).
  |  /  |
  | /   v
  ||   [ ]     <-- vector pre header.
  |/    |
  |     v
  |    [  ] \
  |    [  ]_|   <-- vector loop (created during VPlan execution).
  |     |
  |     v
  \   -[ ]   <--- middle-block (wrapped in VPIRBasicBlock with the branch to
   |    |                       successors created during VPlan execution)
   \/   |
   /\   v
   | ->[ ]     <--- new preheader (wrapped in VPIRBasicBlock).
   |    |
 (opt)  v      <-- edge from middle to exit iff epilogue is not required.
   |   [ ] \
   |   [ ]_|   <-- old scalar loop to handle remainder (scalar epilogue, header
   |    |          wrapped in VPIRBasicBlock).
    \   |
     \  v
      >[ ]     <-- exit block(s). (wrapped in VPIRBasicBlock)
   ...
   */

  // Create an empty vector loop, and prepare basic blocks for the runtime
  // checks.
  createVectorLoopSkeleton(Prefix: "");

  // Now, compare the new count to zero. If it is zero skip the vector loop and
  // jump to the scalar loop. This check also covers the case where the
  // backedge-taken count is uint##_max: adding one to it will overflow leading
  // to an incorrect trip count of zero. In this (rare) case we will also jump
  // to the scalar loop.
  emitIterationCountCheck(Bypass: LoopScalarPreHeader);

  // The scalar preheader is guaranteed to be reachable now, so it can finally
  // be wrapped in a VPIRBasicBlock (see NOTE in createVectorLoopSkeleton).
  replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
  return LoopVectorPreHeader;
}
2528
2529namespace {
2530
/// DenseMapInfo implementation that hashes and compares instructions
/// structurally (via isIdenticalTo), used by cse() to detect duplicate
/// computations.
struct CSEDenseMapInfo {
  // Only these instruction kinds participate in CSE.
  static bool canHandle(const Instruction *I) {
    return isa<InsertElementInst>(Val: I) || isa<ExtractElementInst>(Val: I) ||
           isa<ShuffleVectorInst>(Val: I) || isa<GetElementPtrInst>(Val: I);
  }

  static inline Instruction *getEmptyKey() {
    return DenseMapInfo<Instruction *>::getEmptyKey();
  }

  static inline Instruction *getTombstoneKey() {
    return DenseMapInfo<Instruction *>::getTombstoneKey();
  }

  // Hash the opcode together with all operands so structurally identical
  // instructions land in the same bucket.
  static unsigned getHashValue(const Instruction *I) {
    assert(canHandle(I) && "Unknown instruction!");
    return hash_combine(args: I->getOpcode(),
                        args: hash_combine_range(R: I->operand_values()));
  }

  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
    // Sentinel keys only compare equal to themselves; never dereference them.
    if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
        LHS == getTombstoneKey() || RHS == getTombstoneKey())
      return LHS == RHS;
    return LHS->isIdenticalTo(I: RHS);
  }
};
2558
2559} // end anonymous namespace
2560
2561///Perform cse of induction variable instructions.
2562static void cse(BasicBlock *BB) {
2563 // Perform simple cse.
2564 SmallDenseMap<Instruction *, Instruction *, 4, CSEDenseMapInfo> CSEMap;
2565 for (Instruction &In : llvm::make_early_inc_range(Range&: *BB)) {
2566 if (!CSEDenseMapInfo::canHandle(I: &In))
2567 continue;
2568
2569 // Check if we can replace this instruction with any of the
2570 // visited instructions.
2571 if (Instruction *V = CSEMap.lookup(Val: &In)) {
2572 In.replaceAllUsesWith(V);
2573 In.eraseFromParent();
2574 continue;
2575 }
2576
2577 CSEMap[&In] = &In;
2578 }
2579}
2580
2581/// This function attempts to return a value that represents the vectorization
2582/// factor at runtime. For fixed-width VFs we know this precisely at compile
2583/// time, but for scalable VFs we calculate it based on an estimate of the
2584/// vscale value.
2585static unsigned getEstimatedRuntimeVF(ElementCount VF,
2586 std::optional<unsigned> VScale) {
2587 unsigned EstimatedVF = VF.getKnownMinValue();
2588 if (VF.isScalable())
2589 if (VScale)
2590 EstimatedVF *= *VScale;
2591 assert(EstimatedVF >= 1 && "Estimated VF shouldn't be less than 1");
2592 return EstimatedVF;
2593}
2594
/// Return the cost of the call \p CI at \p VF. For vector VFs the cost was
/// pre-computed as part of the call widening decision; for VF=1 it is
/// computed here from the scalar call (or cheaper intrinsic) cost.
InstructionCost
LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
                                              ElementCount VF) const {
  // We only need to calculate a cost if the VF is scalar; for actual vectors
  // we should already have a pre-calculated cost at each VF.
  if (!VF.isScalar())
    return getCallWideningDecision(CI, VF).Cost;

  Type *RetTy = CI->getType();
  // An fmuladd call may be part of a reduction pattern with its own cost.
  if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
    if (auto RedCost = getReductionPatternCost(I: CI, VF, VectorTy: RetTy))
      return *RedCost;

  SmallVector<Type *, 4> Tys;
  for (auto &ArgOp : CI->args())
    Tys.push_back(Elt: ArgOp->getType());

  InstructionCost ScalarCallCost =
      TTI.getCallInstrCost(F: CI->getCalledFunction(), RetTy, Tys, CostKind);

  // If this is an intrinsic we may have a lower cost for it.
  if (getVectorIntrinsicIDForCall(CI, TLI)) {
    InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF);
    // Pick the cheaper of the libcall and intrinsic lowerings.
    return std::min(a: ScalarCallCost, b: IntrinsicCost);
  }
  return ScalarCallCost;
}
2622
2623static Type *maybeVectorizeType(Type *Ty, ElementCount VF) {
2624 if (VF.isScalar() || !canVectorizeTy(Ty))
2625 return Ty;
2626 return toVectorizedTy(Ty, EC: VF);
2627}
2628
/// Compute the cost of lowering the call \p CI as a vector intrinsic at
/// \p VF, widening the return and parameter types accordingly.
InstructionCost
LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
                                                   ElementCount VF) const {
  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  assert(ID && "Expected intrinsic call!");
  Type *RetTy = maybeVectorizeType(Ty: CI->getType(), VF);
  // Preserve the call's fast-math flags, if any, in the cost query.
  FastMathFlags FMF;
  if (auto *FPMO = dyn_cast<FPMathOperator>(Val: CI))
    FMF = FPMO->getFastMathFlags();

  SmallVector<const Value *> Arguments(CI->args());
  FunctionType *FTy = CI->getCalledFunction()->getFunctionType();
  // Widen each parameter type to the corresponding vector type for VF.
  SmallVector<Type *> ParamTys;
  std::transform(first: FTy->param_begin(), last: FTy->param_end(),
                 result: std::back_inserter(x&: ParamTys),
                 unary_op: [&](Type *Ty) { return maybeVectorizeType(Ty, VF); });

  IntrinsicCostAttributes CostAttrs(ID, RetTy, Arguments, ParamTys, FMF,
                                    dyn_cast<IntrinsicInst>(Val: CI),
                                    InstructionCost::getInvalid(), TLI);
  return TTI.getIntrinsicInstrCost(ICA: CostAttrs, CostKind);
}
2651
/// Final fix-ups after VPlan execution: complete widened non-induction phis,
/// invalidate stale SCEV results, CSE redundant induction instructions in the
/// vector-loop header, and re-distribute profile weights.
void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
  // Fix widened non-induction PHIs by setting up the PHI operands.
  fixNonInductionPHIs(State);

  // After vectorization, the exit blocks of the original loop will have
  // additional predecessors. Invalidate SCEVs for the exit phis in case SE
  // looked through single-entry phis.
  SmallVector<BasicBlock *> ExitBlocks;
  OrigLoop->getExitBlocks(ExitBlocks);
  for (BasicBlock *Exit : ExitBlocks)
    for (PHINode &PN : Exit->phis())
      PSE.getSE()->forgetLcssaPhiWithNewPredecessor(L: OrigLoop, V: &PN);

  // Forget the original basic block.
  PSE.getSE()->forgetLoop(L: OrigLoop);
  PSE.getSE()->forgetBlockAndLoopDispositions();

  // Don't apply optimizations below when no (vector) loop remains, as they all
  // require one at the moment.
  VPBasicBlock *HeaderVPBB =
      vputils::getFirstLoopHeader(Plan&: *State.Plan, VPDT&: State.VPDT);
  if (!HeaderVPBB)
    return;

  BasicBlock *HeaderBB = State.CFG.VPBB2IRBB[HeaderVPBB];

  // Remove redundant induction instructions.
  cse(BB: HeaderBB);

  // Set/update profile weights for the vector and remainder loops as original
  // loop iterations are now distributed among them. Note that original loop
  // becomes the scalar remainder loop after vectorization.
  //
  // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
  // end up getting slightly roughened result but that should be OK since
  // profile is not inherently precise anyway. Note also possible bypass of
  // vector code caused by legality checks is ignored, assigning all the weight
  // to the vector loop, optimistically.
  //
  // For scalable vectorization we can't know at compile time how many
  // iterations of the loop are handled in one vector iteration, so instead
  // use the value of vscale used for tuning.
  Loop *VectorLoop = LI->getLoopFor(BB: HeaderBB);
  unsigned EstimatedVFxUF =
      getEstimatedRuntimeVF(VF: VF * UF, VScale: Cost->getVScaleForTuning());
  setProfileInfoAfterUnrolling(OrigLoop, UnrolledLoop: VectorLoop, RemainderLoop: OrigLoop, UF: EstimatedVFxUF);
}
2699
2700void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
2701 auto Iter = vp_depth_first_shallow(G: Plan.getEntry());
2702 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
2703 for (VPRecipeBase &P : VPBB->phis()) {
2704 VPWidenPHIRecipe *VPPhi = dyn_cast<VPWidenPHIRecipe>(Val: &P);
2705 if (!VPPhi)
2706 continue;
2707 PHINode *NewPhi = cast<PHINode>(Val: State.get(Def: VPPhi));
2708 // Make sure the builder has a valid insert point.
2709 Builder.SetInsertPoint(NewPhi);
2710 for (unsigned Idx = 0; Idx < VPPhi->getNumIncoming(); ++Idx) {
2711 VPValue *Inc = VPPhi->getIncomingValue(Idx);
2712 const VPBasicBlock *VPBB = VPPhi->getIncomingBlock(Idx);
2713 NewPhi->addIncoming(V: State.get(Def: Inc), BB: State.CFG.VPBB2IRBB[VPBB]);
2714 }
2715 }
2716 }
2717}
2718
/// Collect the set of instructions known to remain scalar after vectorization
/// with factor \p VF; the result is recorded in Scalars[VF].
void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) {
  // We should not collect Scalars more than once per VF. Right now, this
  // function is called from collectUniformsAndScalars(), which already does
  // this check. Collecting Scalars for VF=1 does not make any sense.
  assert(VF.isVector() && !Scalars.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // This avoids any chances of creating a REPLICATE recipe during planning
  // since that would result in generation of scalarized code during execution,
  // which is not supported for scalable vectors.
  if (VF.isScalable()) {
    Scalars[VF].insert_range(R&: Uniforms[VF]);
    return;
  }

  SmallSetVector<Instruction *, 8> Worklist;

  // These sets are used to seed the analysis with pointers used by memory
  // accesses that will remain scalar.
  SmallSetVector<Instruction *, 8> ScalarPtrs;
  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
  auto *Latch = TheLoop->getLoopLatch();

  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
  // The pointer operands of loads and stores will be scalar as long as the
  // memory access is not a gather or scatter operation. The value operand of a
  // store will remain scalar if the store is scalarized.
  auto IsScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
    InstWidening WideningDecision = getWideningDecision(I: MemAccess, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");
    if (auto *Store = dyn_cast<StoreInst>(Val: MemAccess))
      if (Ptr == Store->getValueOperand())
        return WideningDecision == CM_Scalarize;
    assert(Ptr == getLoadStorePointerOperand(MemAccess) &&
           "Ptr is neither a value or pointer operand");
    return WideningDecision != CM_GatherScatter;
  };

  // A helper that returns true if the given value is a getelementptr
  // instruction contained in the loop.
  auto IsLoopVaryingGEP = [&](Value *V) {
    return isa<GetElementPtrInst>(Val: V) && !TheLoop->isLoopInvariant(V);
  };

  // A helper that evaluates a memory access's use of a pointer. If the use will
  // be a scalar use and the pointer is only used by memory accesses, we place
  // the pointer in ScalarPtrs. Otherwise, the pointer is placed in
  // PossibleNonScalarPtrs.
  auto EvaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
    // We only care about getelementptr instructions contained in the loop
    // (see IsLoopVaryingGEP).
    if (!IsLoopVaryingGEP(Ptr))
      return;

    // If the pointer has already been identified as scalar (e.g., if it was
    // also identified as uniform), there's nothing to do.
    auto *I = cast<Instruction>(Val: Ptr);
    if (Worklist.count(key: I))
      return;

    // If the use of the pointer will be a scalar use, and all users of the
    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
    // place the pointer in PossibleNonScalarPtrs.
    if (IsScalarUse(MemAccess, Ptr) &&
        all_of(Range: I->users(), P: IsaPred<LoadInst, StoreInst>))
      ScalarPtrs.insert(X: I);
    else
      PossibleNonScalarPtrs.insert(Ptr: I);
  };

  // We seed the scalars analysis with three classes of instructions: (1)
  // instructions marked uniform-after-vectorization, (2) getelementptr
  // instructions used by memory accesses requiring a scalar use, and (3)
  // instructions explicitly forced to be scalar (inserted further below).
  //
  // (1) Add to the worklist all instructions that have been identified as
  // uniform-after-vectorization.
  Worklist.insert_range(R&: Uniforms[VF]);

  // (2) Add to the worklist all getelementptr instructions used by
  // memory accesses requiring a scalar use. The pointer operands of loads and
  // stores will be scalar unless the operation is a gather or scatter.
  // The value operand of a store will remain scalar if the store is scalarized.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (auto *Load = dyn_cast<LoadInst>(Val: &I)) {
        EvaluatePtrUse(Load, Load->getPointerOperand());
      } else if (auto *Store = dyn_cast<StoreInst>(Val: &I)) {
        EvaluatePtrUse(Store, Store->getPointerOperand());
        EvaluatePtrUse(Store, Store->getValueOperand());
      }
    }
  for (auto *I : ScalarPtrs)
    if (!PossibleNonScalarPtrs.count(Ptr: I)) {
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Insert the forced scalars.
  // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector
  // induction variable when the PHI user is scalarized.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (ForcedScalar != ForcedScalars.end())
    for (auto *I : ForcedScalar->second) {
      LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n");
      Worklist.insert(X: I);
    }

  // Expand the worklist by looking through any getelementptr instructions
  // we've already identified as scalar. This is similar to the
  // expansion step in collectLoopUniforms(); however, here we're only
  // expanding to include additional getelementptr instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *Dst = Worklist[Idx++];
    if (!IsLoopVaryingGEP(Dst->getOperand(i: 0)))
      continue;
    auto *Src = cast<Instruction>(Val: Dst->getOperand(i: 0));
    if (llvm::all_of(Range: Src->users(), P: [&](User *U) -> bool {
          auto *J = cast<Instruction>(Val: U);
          return !TheLoop->contains(Inst: J) || Worklist.count(key: J) ||
                 ((isa<LoadInst>(Val: J) || isa<StoreInst>(Val: J)) &&
                  IsScalarUse(J, Src));
        })) {
      Worklist.insert(X: Src);
      LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
    }
  }

  // An induction variable will remain scalar if all users of the induction
  // variable and induction variable update remain scalar.
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // If tail-folding is applied, the primary induction variable will be used
    // to feed a vector compare.
    if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
      continue;

    // Returns true if \p Indvar is a pointer induction that is used directly by
    // load/store instruction \p I.
    auto IsDirectLoadStoreFromPtrIndvar = [&](Instruction *Indvar,
                                              Instruction *I) {
      return Induction.second.getKind() ==
                 InductionDescriptor::IK_PtrInduction &&
             (isa<LoadInst>(Val: I) || isa<StoreInst>(Val: I)) &&
             Indvar == getLoadStorePointerOperand(V: I) && IsScalarUse(I, Indvar);
    };

    // Determine if all users of the induction variable are scalar after
    // vectorization.
    bool ScalarInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(Ind, I);
    });
    if (!ScalarInd)
      continue;

    // If the induction variable update is a fixed-order recurrence, neither the
    // induction variable or its update should be marked scalar after
    // vectorization.
    auto *IndUpdatePhi = dyn_cast<PHINode>(Val: IndUpdate);
    if (IndUpdatePhi && Legal->isFixedOrderRecurrence(Phi: IndUpdatePhi))
      continue;

    // Determine if all users of the induction variable update instruction are
    // scalar after vectorization.
    bool ScalarIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsDirectLoadStoreFromPtrIndvar(IndUpdate, I);
    });
    if (!ScalarIndUpdate)
      continue;

    // The induction variable and its update instruction will remain scalar.
    Worklist.insert(X: Ind);
    Worklist.insert(X: IndUpdate);
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
    LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate
                      << "\n");
  }

  Scalars[VF].insert_range(R&: Worklist);
}
2907
/// Returns true if \p I requires predication and has no non-scalar lowering
/// (masked memory op, gather/scatter, widened call, or safe-divisor idiom) at
/// \p VF, i.e. it will be scalarized and predicated.
bool LoopVectorizationCostModel::isScalarWithPredication(
    Instruction *I, ElementCount VF) const {
  if (!isPredicatedInst(I))
    return false;

  // Do we have a non-scalar lowering for this predicated
  // instruction? No - it is scalar with predication.
  switch(I->getOpcode()) {
  default:
    return true;
  case Instruction::Call:
    // A scalar VF has no vector call lowering; otherwise consult the
    // pre-computed call widening decision.
    if (VF.isScalar())
      return true;
    return getCallWideningDecision(CI: cast<CallInst>(Val: I), VF).Kind == CM_Scalarize;
  case Instruction::Load:
  case Instruction::Store: {
    auto *Ptr = getLoadStorePointerOperand(V: I);
    auto *Ty = getLoadStoreType(I);
    unsigned AS = getLoadStoreAddressSpace(I);
    Type *VTy = Ty;
    if (VF.isVector())
      VTy = VectorType::get(ElementType: Ty, EC: VF);
    const Align Alignment = getLoadStoreAlignment(I);
    // A masked load/store or gather/scatter avoids scalarization when the
    // target supports it for this type, alignment and address space.
    return isa<LoadInst>(Val: I) ? !(isLegalMaskedLoad(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                             TTI.isLegalMaskedGather(DataType: VTy, Alignment))
                        : !(isLegalMaskedStore(DataType: Ty, Ptr, Alignment, AddressSpace: AS) ||
                            TTI.isLegalMaskedScatter(DataType: VTy, Alignment));
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::SRem:
  case Instruction::URem: {
    // We have the option to use the safe-divisor idiom to avoid predication.
    // The cost based decision here will always select safe-divisor for
    // scalable vectors as scalarization isn't legal.
    const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
    return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost);
  }
  }
}
2948
2949// TODO: Fold into LoopVectorizationLegality::isMaskRequired.
2950bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const {
2951 // TODO: We can use the loop-preheader as context point here and get
2952 // context sensitive reasoning for isSafeToSpeculativelyExecute.
2953 if (isSafeToSpeculativelyExecute(I) ||
2954 (isa<LoadInst, StoreInst, CallInst>(Val: I) && !Legal->isMaskRequired(I)) ||
2955 isa<BranchInst, SwitchInst, PHINode, AllocaInst>(Val: I))
2956 return false;
2957
2958 // If the instruction was executed conditionally in the original scalar loop,
2959 // predication is needed with a mask whose lanes are all possibly inactive.
2960 if (Legal->blockNeedsPredication(BB: I->getParent()))
2961 return true;
2962
2963 // If we're not folding the tail by masking, predication is unnecessary.
2964 if (!foldTailByMasking())
2965 return false;
2966
2967 // All that remain are instructions with side-effects originally executed in
2968 // the loop unconditionally, but now execute under a tail-fold mask (only)
2969 // having at least one active lane (the first). If the side-effects of the
2970 // instruction are invariant, executing it w/o (the tail-folding) mask is safe
2971 // - it will cause the same side-effects as when masked.
2972 switch(I->getOpcode()) {
2973 default:
2974 llvm_unreachable(
2975 "instruction should have been considered by earlier checks");
2976 case Instruction::Call:
2977 // Side-effects of a Call are assumed to be non-invariant, needing a
2978 // (fold-tail) mask.
2979 assert(Legal->isMaskRequired(I) &&
2980 "should have returned earlier for calls not needing a mask");
2981 return true;
2982 case Instruction::Load:
2983 // If the address is loop invariant no predication is needed.
2984 return !Legal->isInvariant(V: getLoadStorePointerOperand(V: I));
2985 case Instruction::Store: {
2986 // For stores, we need to prove both speculation safety (which follows from
2987 // the same argument as loads), but also must prove the value being stored
2988 // is correct. The easiest form of the later is to require that all values
2989 // stored are the same.
2990 return !(Legal->isInvariant(V: getLoadStorePointerOperand(V: I)) &&
2991 Legal->isInvariant(V: cast<StoreInst>(Val: I)->getValueOperand()));
2992 }
2993 case Instruction::UDiv:
2994 case Instruction::SDiv:
2995 case Instruction::SRem:
2996 case Instruction::URem:
2997 // If the divisor is loop-invariant no predication is needed.
2998 return !Legal->isInvariant(V: I->getOperand(i: 1));
2999 }
3000}
3001
// Compute the costs of the two alternative lowerings for a predicated
// div/rem: (1) scalarizing with per-lane predicated blocks, and (2) keeping
// the operation vectorized by selecting a "safe" divisor for masked-off
// lanes. Returns {scalarization cost, safe-divisor cost}; the scalarization
// cost is invalid for scalable VFs, for which scalarization is not legal.
std::pair<InstructionCost, InstructionCost>
LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
                                                     ElementCount VF) const {
  assert(I->getOpcode() == Instruction::UDiv ||
         I->getOpcode() == Instruction::SDiv ||
         I->getOpcode() == Instruction::SRem ||
         I->getOpcode() == Instruction::URem);
  // A speculatable div/rem would not be a predicated instruction in the
  // first place (see isPredicatedInst).
  assert(!isSafeToSpeculativelyExecute(I));

  // Scalarization isn't legal for scalable vector types
  InstructionCost ScalarizationCost = InstructionCost::getInvalid();
  if (!VF.isScalable()) {
    // Get the scalarization cost and scale this amount by the probability of
    // executing the predicated block. If the instruction is not predicated,
    // we fall through to the next case.
    ScalarizationCost = 0;

    // These instructions have a non-void type, so account for the phi nodes
    // that we will create. This cost is likely to be zero. The phi node
    // cost, if any, should be scaled by the block probability because it
    // models a copy at the end of each predicated block.
    ScalarizationCost +=
        VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);

    // The cost of the non-predicated instruction.
    ScalarizationCost +=
        VF.getFixedValue() *
        TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: I->getType(), CostKind);

    // The cost of insertelement and extractelement instructions needed for
    // scalarization.
    ScalarizationCost += getScalarizationOverhead(I, VF);

    // Scale the cost by the probability of executing the predicated blocks.
    // This assumes the predicated block for each vector lane is equally
    // likely.
    ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
  }
  InstructionCost SafeDivisorCost = 0;

  auto *VecTy = toVectorTy(Scalar: I->getType(), EC: VF);

  // The cost of the select guard to ensure all lanes are well defined
  // after we speculate above any internal control flow.
  SafeDivisorCost +=
      TTI.getCmpSelInstrCost(Opcode: Instruction::Select, ValTy: VecTy,
                             CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
                             VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);

  // Certain instructions can be cheaper to vectorize if they have a constant
  // second vector operand. One example of this are shifts on x86.
  Value *Op2 = I->getOperand(i: 1);
  auto Op2Info = TTI.getOperandInfo(V: Op2);
  // A loop-invariant divisor is as good as a uniform one for costing
  // purposes, even if TTI could not classify it on its own.
  if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
      Legal->isInvariant(V: Op2))
    Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

  SmallVector<const Value *, 4> Operands(I->operand_values());
  SafeDivisorCost += TTI.getArithmeticInstrCost(
      Opcode: I->getOpcode(), Ty: VecTy, CostKind,
      Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
      Opd2Info: Op2Info, Args: Operands, CxtI: I);
  return {ScalarizationCost, SafeDivisorCost};
}
3066
3067bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(
3068 Instruction *I, ElementCount VF) const {
3069 assert(isAccessInterleaved(I) && "Expecting interleaved access.");
3070 assert(getWideningDecision(I, VF) == CM_Unknown &&
3071 "Decision should not be set yet.");
3072 auto *Group = getInterleavedAccessGroup(Instr: I);
3073 assert(Group && "Must have a group.");
3074 unsigned InterleaveFactor = Group->getFactor();
3075
3076 // If the instruction's allocated size doesn't equal its type size, it
3077 // requires padding and will be scalarized.
3078 auto &DL = I->getDataLayout();
3079 auto *ScalarTy = getLoadStoreType(I);
3080 if (hasIrregularType(Ty: ScalarTy, DL))
3081 return false;
3082
3083 // For scalable vectors, the interleave factors must be <= 8 since we require
3084 // the (de)interleaveN intrinsics instead of shufflevectors.
3085 if (VF.isScalable() && InterleaveFactor > 8)
3086 return false;
3087
3088 // If the group involves a non-integral pointer, we may not be able to
3089 // losslessly cast all values to a common type.
3090 bool ScalarNI = DL.isNonIntegralPointerType(Ty: ScalarTy);
3091 for (unsigned Idx = 0; Idx < InterleaveFactor; Idx++) {
3092 Instruction *Member = Group->getMember(Index: Idx);
3093 if (!Member)
3094 continue;
3095 auto *MemberTy = getLoadStoreType(I: Member);
3096 bool MemberNI = DL.isNonIntegralPointerType(Ty: MemberTy);
3097 // Don't coerce non-integral pointers to integers or vice versa.
3098 if (MemberNI != ScalarNI)
3099 // TODO: Consider adding special nullptr value case here
3100 return false;
3101 if (MemberNI && ScalarNI &&
3102 ScalarTy->getPointerAddressSpace() !=
3103 MemberTy->getPointerAddressSpace())
3104 return false;
3105 }
3106
3107 // Check if masking is required.
3108 // A Group may need masking for one of two reasons: it resides in a block that
3109 // needs predication, or it was decided to use masking to deal with gaps
3110 // (either a gap at the end of a load-access that may result in a speculative
3111 // load, or any gaps in a store-access).
3112 bool PredicatedAccessRequiresMasking =
3113 blockNeedsPredicationForAnyReason(BB: I->getParent()) &&
3114 Legal->isMaskRequired(I);
3115 bool LoadAccessWithGapsRequiresEpilogMasking =
3116 isa<LoadInst>(Val: I) && Group->requiresScalarEpilogue() &&
3117 !isScalarEpilogueAllowed();
3118 bool StoreAccessWithGapsRequiresMasking =
3119 isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor());
3120 if (!PredicatedAccessRequiresMasking &&
3121 !LoadAccessWithGapsRequiresEpilogMasking &&
3122 !StoreAccessWithGapsRequiresMasking)
3123 return true;
3124
3125 // If masked interleaving is required, we expect that the user/target had
3126 // enabled it, because otherwise it either wouldn't have been created or
3127 // it should have been invalidated by the CostModel.
3128 assert(useMaskedInterleavedAccesses(TTI) &&
3129 "Masked interleave-groups for predicated accesses are not enabled.");
3130
3131 if (Group->isReverse())
3132 return false;
3133
3134 auto *Ty = getLoadStoreType(I);
3135 const Align Alignment = getLoadStoreAlignment(I);
3136 unsigned AS = getLoadStoreAddressSpace(I);
3137 return isa<LoadInst>(Val: I) ? TTI.isLegalMaskedLoad(DataType: Ty, Alignment, AddressSpace: AS)
3138 : TTI.isLegalMaskedStore(DataType: Ty, Alignment, AddressSpace: AS);
3139}
3140
3141bool LoopVectorizationCostModel::memoryInstructionCanBeWidened(
3142 Instruction *I, ElementCount VF) {
3143 // Get and ensure we have a valid memory instruction.
3144 assert((isa<LoadInst, StoreInst>(I)) && "Invalid memory instruction");
3145
3146 auto *Ptr = getLoadStorePointerOperand(V: I);
3147 auto *ScalarTy = getLoadStoreType(I);
3148
3149 // In order to be widened, the pointer should be consecutive, first of all.
3150 if (!Legal->isConsecutivePtr(AccessTy: ScalarTy, Ptr))
3151 return false;
3152
3153 // If the instruction is a store located in a predicated block, it will be
3154 // scalarized.
3155 if (isScalarWithPredication(I, VF))
3156 return false;
3157
3158 // If the instruction's allocated size doesn't equal it's type size, it
3159 // requires padding and will be scalarized.
3160 auto &DL = I->getDataLayout();
3161 if (hasIrregularType(Ty: ScalarTy, DL))
3162 return false;
3163
3164 return true;
3165}
3166
// Collect, for the given VF, the instructions that will remain uniform after
// vectorization, i.e. whose users only demand lane 0 of the unrolled
// iterations. The result is cached in Uniforms[VF].
void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) {
  // We should not collect Uniforms more than once per VF. Right now,
  // this function is called from collectUniformsAndScalars(), which
  // already does this check. Collecting Uniforms for VF=1 does not make any
  // sense.

  assert(VF.isVector() && !Uniforms.contains(VF) &&
         "This function should not be visited twice for the same VF");

  // Visit the list of Uniforms. If we find no uniform value, we won't
  // analyze again. Uniforms.count(VF) will return 1.
  Uniforms[VF].clear();

  // Now we know that the loop is vectorizable!
  // Collect instructions inside the loop that will remain uniform after
  // vectorization.

  // Global values, params and instructions outside of current loop are out of
  // scope.
  auto IsOutOfScope = [&](Value *V) -> bool {
    Instruction *I = dyn_cast<Instruction>(Val: V);
    return (!I || !TheLoop->contains(Inst: I));
  };

  // Worklist containing uniform instructions demanding lane 0.
  SetVector<Instruction *> Worklist;

  // Add uniform instructions demanding lane 0 to the worklist. Instructions
  // that require predication must not be considered uniform after
  // vectorization, because that would create an erroneous replicating region
  // where only a single instance out of VF should be formed.
  auto AddToWorklistIfAllowed = [&](Instruction *I) -> void {
    if (IsOutOfScope(I)) {
      LLVM_DEBUG(dbgs() << "LV: Found not uniform due to scope: "
                        << *I << "\n");
      return;
    }
    if (isPredicatedInst(I)) {
      LLVM_DEBUG(
          dbgs() << "LV: Found not uniform due to requiring predication: " << *I
                 << "\n");
      return;
    }
    LLVM_DEBUG(dbgs() << "LV: Found uniform instruction: " << *I << "\n");
    Worklist.insert(X: I);
  };

  // Start with the conditional branches exiting the loop. If the branch
  // condition is an instruction contained in the loop that is only used by the
  // branch, it is uniform. Note conditions from uncountable early exits are not
  // uniform.
  SmallVector<BasicBlock *> Exiting;
  TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  for (BasicBlock *E : Exiting) {
    if (Legal->hasUncountableEarlyExit() && TheLoop->getLoopLatch() != E)
      continue;
    auto *Cmp = dyn_cast<Instruction>(Val: E->getTerminator()->getOperand(i: 0));
    if (Cmp && TheLoop->contains(Inst: Cmp) && Cmp->hasOneUse())
      AddToWorklistIfAllowed(Cmp);
  }

  auto PrevVF = VF.divideCoefficientBy(RHS: 2);
  // Return true if all lanes perform the same memory operation, and we can
  // thus choose to execute only one.
  auto IsUniformMemOpUse = [&](Instruction *I) {
    // If the value was already known to not be uniform for the previous
    // (smaller VF), it cannot be uniform for the larger VF.
    if (PrevVF.isVector()) {
      auto Iter = Uniforms.find(Val: PrevVF);
      if (Iter != Uniforms.end() && !Iter->second.contains(Ptr: I))
        return false;
    }
    if (!Legal->isUniformMemOp(I&: *I, VF))
      return false;
    if (isa<LoadInst>(Val: I))
      // Loading the same address always produces the same result - at least
      // assuming aliasing and ordering which have already been checked.
      return true;
    // Storing the same value on every iteration.
    return TheLoop->isLoopInvariant(V: cast<StoreInst>(Val: I)->getValueOperand());
  };

  // Return true if I's widening decision keeps its pointer operand uniform
  // (widened consecutive/reverse accesses and interleaved accesses use a
  // single scalar pointer).
  auto IsUniformDecision = [&](Instruction *I, ElementCount VF) {
    InstWidening WideningDecision = getWideningDecision(I, VF);
    assert(WideningDecision != CM_Unknown &&
           "Widening decision should be ready at this moment");

    if (IsUniformMemOpUse(I))
      return true;

    return (WideningDecision == CM_Widen ||
            WideningDecision == CM_Widen_Reverse ||
            WideningDecision == CM_Interleave);
  };

  // Returns true if Ptr is the pointer operand of a memory access instruction
  // I, I is known to not require scalarization, and the pointer is not also
  // stored.
  auto IsVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
    if (isa<StoreInst>(Val: I) && I->getOperand(i: 0) == Ptr)
      return false;
    return getLoadStorePointerOperand(V: I) == Ptr &&
           (IsUniformDecision(I, VF) || Legal->isInvariant(V: Ptr));
  };

  // Holds a list of values which are known to have at least one uniform use.
  // Note that there may be other uses which aren't uniform. A "uniform use"
  // here is something which only demands lane 0 of the unrolled iterations;
  // it does not imply that all lanes produce the same value (e.g. this is not
  // the usual meaning of uniform)
  SetVector<Value *> HasUniformUse;

  // Scan the loop for instructions which are either a) known to have only
  // lane 0 demanded or b) are uses which demand only lane 0 of their operand.
  for (auto *BB : TheLoop->blocks())
    for (auto &I : *BB) {
      if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Val: &I)) {
        switch (II->getIntrinsicID()) {
        case Intrinsic::sideeffect:
        case Intrinsic::experimental_noalias_scope_decl:
        case Intrinsic::assume:
        case Intrinsic::lifetime_start:
        case Intrinsic::lifetime_end:
          // These intrinsics only need to execute once per vector iteration
          // when their operands are loop-invariant.
          if (TheLoop->hasLoopInvariantOperands(I: &I))
            AddToWorklistIfAllowed(&I);
          break;
        default:
          break;
        }
      }

      if (auto *EVI = dyn_cast<ExtractValueInst>(Val: &I)) {
        if (IsOutOfScope(EVI->getAggregateOperand())) {
          AddToWorklistIfAllowed(EVI);
          continue;
        }
        // Only ExtractValue instructions where the aggregate value comes from a
        // call are allowed to be non-uniform.
        assert(isa<CallInst>(EVI->getAggregateOperand()) &&
               "Expected aggregate value to be call return value");
      }

      // If there's no pointer operand, there's nothing to do.
      auto *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      if (IsUniformMemOpUse(&I))
        AddToWorklistIfAllowed(&I);

      if (IsVectorizedMemAccessUse(&I, Ptr))
        HasUniformUse.insert(X: Ptr);
    }

  // Add to the worklist any operands which have *only* uniform (e.g. lane 0
  // demanding) users. Since loops are assumed to be in LCSSA form, this
  // disallows uses outside the loop as well.
  for (auto *V : HasUniformUse) {
    if (IsOutOfScope(V))
      continue;
    auto *I = cast<Instruction>(Val: V);
    bool UsersAreMemAccesses = all_of(Range: I->users(), P: [&](User *U) -> bool {
      auto *UI = cast<Instruction>(Val: U);
      return TheLoop->contains(Inst: UI) && IsVectorizedMemAccessUse(UI, V);
    });
    if (UsersAreMemAccesses)
      AddToWorklistIfAllowed(I);
  }

  // Expand Worklist in topological order: whenever a new instruction
  // is added, its users should be already inside Worklist. It ensures
  // a uniform instruction will only be used by uniform instructions.
  unsigned Idx = 0;
  while (Idx != Worklist.size()) {
    Instruction *I = Worklist[Idx++];

    for (auto *OV : I->operand_values()) {
      // isOutOfScope operands cannot be uniform instructions.
      if (IsOutOfScope(OV))
        continue;
      // First order recurrence Phi's should typically be considered
      // non-uniform.
      auto *OP = dyn_cast<PHINode>(Val: OV);
      if (OP && Legal->isFixedOrderRecurrence(Phi: OP))
        continue;
      // If all the users of the operand are uniform, then add the
      // operand into the uniform worklist.
      auto *OI = cast<Instruction>(Val: OV);
      if (llvm::all_of(Range: OI->users(), P: [&](User *U) -> bool {
            auto *J = cast<Instruction>(Val: U);
            return Worklist.count(key: J) || IsVectorizedMemAccessUse(J, OI);
          }))
        AddToWorklistIfAllowed(OI);
    }
  }

  // For an instruction to be added into Worklist above, all its users inside
  // the loop should also be in Worklist. However, this condition cannot be
  // true for phi nodes that form a cyclic dependence. We must process phi
  // nodes separately. An induction variable will remain uniform if all users
  // of the induction variable and induction variable update remain uniform.
  // The code below handles both pointer and non-pointer induction variables.
  BasicBlock *Latch = TheLoop->getLoopLatch();
  for (const auto &Induction : Legal->getInductionVars()) {
    auto *Ind = Induction.first;
    auto *IndUpdate = cast<Instruction>(Val: Ind->getIncomingValueForBlock(BB: Latch));

    // Determine if all users of the induction variable are uniform after
    // vectorization.
    bool UniformInd = all_of(Range: Ind->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == IndUpdate || !TheLoop->contains(Inst: I) || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, Ind);
    });
    if (!UniformInd)
      continue;

    // Determine if all users of the induction variable update instruction are
    // uniform after vectorization.
    // NOTE(review): unlike the UniformInd check above, out-of-loop users are
    // not exempted here — presumably deliberate/conservative for the update's
    // exit-block (LCSSA) users; confirm against collectLoopScalars, which does
    // exempt them.
    bool UniformIndUpdate = all_of(Range: IndUpdate->users(), P: [&](User *U) -> bool {
      auto *I = cast<Instruction>(Val: U);
      return I == Ind || Worklist.count(key: I) ||
             IsVectorizedMemAccessUse(I, IndUpdate);
    });
    if (!UniformIndUpdate)
      continue;

    // The induction variable and its update instruction will remain uniform.
    AddToWorklistIfAllowed(Ind);
    AddToWorklistIfAllowed(IndUpdate);
  }

  Uniforms[VF].insert_range(R&: Worklist);
}
3401
3402bool LoopVectorizationCostModel::runtimeChecksRequired() {
3403 LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n");
3404
3405 if (Legal->getRuntimePointerChecking()->Need) {
3406 reportVectorizationFailure(DebugMsg: "Runtime ptr check is required with -Os/-Oz",
3407 OREMsg: "runtime pointer checks needed. Enable vectorization of this "
3408 "loop with '#pragma clang loop vectorize(enable)' when "
3409 "compiling with -Os/-Oz",
3410 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3411 return true;
3412 }
3413
3414 if (!PSE.getPredicate().isAlwaysTrue()) {
3415 reportVectorizationFailure(DebugMsg: "Runtime SCEV check is required with -Os/-Oz",
3416 OREMsg: "runtime SCEV checks needed. Enable vectorization of this "
3417 "loop with '#pragma clang loop vectorize(enable)' when "
3418 "compiling with -Os/-Oz",
3419 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3420 return true;
3421 }
3422
3423 // FIXME: Avoid specializing for stride==1 instead of bailing out.
3424 if (!Legal->getLAI()->getSymbolicStrides().empty()) {
3425 reportVectorizationFailure(DebugMsg: "Runtime stride check for small trip count",
3426 OREMsg: "runtime stride == 1 checks needed. Enable vectorization of "
3427 "this loop without such check by compiling with -Os/-Oz",
3428 ORETag: "CantVersionLoopWithOptForSize", ORE, TheLoop);
3429 return true;
3430 }
3431
3432 return false;
3433}
3434
3435bool LoopVectorizationCostModel::isScalableVectorizationAllowed() {
3436 if (IsScalableVectorizationAllowed)
3437 return *IsScalableVectorizationAllowed;
3438
3439 IsScalableVectorizationAllowed = false;
3440 if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors)
3441 return false;
3442
3443 if (Hints->isScalableVectorizationDisabled()) {
3444 reportVectorizationInfo(Msg: "Scalable vectorization is explicitly disabled",
3445 ORETag: "ScalableVectorizationDisabled", ORE, TheLoop);
3446 return false;
3447 }
3448
3449 LLVM_DEBUG(dbgs() << "LV: Scalable vectorization is available\n");
3450
3451 auto MaxScalableVF = ElementCount::getScalable(
3452 MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
3453
3454 // Test that the loop-vectorizer can legalize all operations for this MaxVF.
3455 // FIXME: While for scalable vectors this is currently sufficient, this should
3456 // be replaced by a more detailed mechanism that filters out specific VFs,
3457 // instead of invalidating vectorization for a whole set of VFs based on the
3458 // MaxVF.
3459
3460 // Disable scalable vectorization if the loop contains unsupported reductions.
3461 if (!canVectorizeReductions(VF: MaxScalableVF)) {
3462 reportVectorizationInfo(
3463 Msg: "Scalable vectorization not supported for the reduction "
3464 "operations found in this loop.",
3465 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3466 return false;
3467 }
3468
3469 // Disable scalable vectorization if the loop contains any instructions
3470 // with element types not supported for scalable vectors.
3471 if (any_of(Range&: ElementTypesInLoop, P: [&](Type *Ty) {
3472 return !Ty->isVoidTy() &&
3473 !this->TTI.isElementTypeLegalForScalableVector(Ty);
3474 })) {
3475 reportVectorizationInfo(Msg: "Scalable vectorization is not supported "
3476 "for all element types found in this loop.",
3477 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3478 return false;
3479 }
3480
3481 if (!Legal->isSafeForAnyVectorWidth() && !getMaxVScale(F: *TheFunction, TTI)) {
3482 reportVectorizationInfo(Msg: "The target does not provide maximum vscale value "
3483 "for safe distance analysis.",
3484 ORETag: "ScalableVFUnfeasible", ORE, TheLoop);
3485 return false;
3486 }
3487
3488 IsScalableVectorizationAllowed = true;
3489 return true;
3490}
3491
// Compute the maximum legal scalable VF, given MaxSafeElements (the maximum
// safe number of elements derived from dependence analysis). Returns
// scalable-0 when scalable vectorization is not allowed or infeasible.
ElementCount
LoopVectorizationCostModel::getMaxLegalScalableVF(unsigned MaxSafeElements) {
  if (!isScalableVectorizationAllowed())
    return ElementCount::getScalable(MinVal: 0);

  // Start from the widest representable scalable VF and clamp it below.
  auto MaxScalableVF = ElementCount::getScalable(
      MinVal: std::numeric_limits<ElementCount::ScalarTy>::max());
  if (Legal->isSafeForAnyVectorWidth())
    return MaxScalableVF;

  std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
  // Limit MaxScalableVF by the maximum safe dependence distance.
  // Note: MaxVScale is guaranteed to have a value here — when the loop is not
  // safe for any vector width, isScalableVectorizationAllowed() has already
  // returned false if getMaxVScale() had no value.
  MaxScalableVF = ElementCount::getScalable(MinVal: MaxSafeElements / *MaxVScale);

  if (!MaxScalableVF)
    reportVectorizationInfo(
        Msg: "Max legal vector width too small, scalable vectorization "
        "unfeasible.",
        ORETag: "ScalableVFUnfeasible", ORE, TheLoop);

  return MaxScalableVF;
}
3514
// Determine the maximum feasible fixed and scalable VFs, honoring a
// user-specified VF when it is safe, and otherwise clamping to the maximum
// safe VFs derived from dependence analysis and target limits.
FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF(
    unsigned MaxTripCount, ElementCount UserVF, bool FoldTailByMasking) {
  MinBWs = computeMinimumValueSizes(Blocks: TheLoop->getBlocks(), DB&: *DB, TTI: &TTI);
  unsigned SmallestType, WidestType;
  std::tie(args&: SmallestType, args&: WidestType) = getSmallestAndWidestTypes();

  // Get the maximum safe dependence distance in bits computed by LAA.
  // It is computed by MaxVF * sizeOf(type) * 8, where type is taken from
  // the memory accesses that is most restrictive (involved in the smallest
  // dependence distance).
  unsigned MaxSafeElementsPowerOf2 =
      bit_floor(Value: Legal->getMaxSafeVectorWidthInBits() / WidestType);
  // Additionally clamp by the store-to-load forwarding distance when it is
  // not safe for arbitrary widths.
  if (!Legal->isSafeForAnyStoreLoadForwardDistances()) {
    unsigned SLDist = Legal->getMaxStoreLoadForwardSafeDistanceInBits();
    MaxSafeElementsPowerOf2 =
        std::min(a: MaxSafeElementsPowerOf2, b: SLDist / WidestType);
  }
  auto MaxSafeFixedVF = ElementCount::getFixed(MinVal: MaxSafeElementsPowerOf2);
  auto MaxSafeScalableVF = getMaxLegalScalableVF(MaxSafeElements: MaxSafeElementsPowerOf2);

  // Record the element limit only when dependences actually constrain it.
  if (!Legal->isSafeForAnyVectorWidth())
    this->MaxSafeElements = MaxSafeElementsPowerOf2;

  LLVM_DEBUG(dbgs() << "LV: The max safe fixed VF is: " << MaxSafeFixedVF
                    << ".\n");
  LLVM_DEBUG(dbgs() << "LV: The max safe scalable VF is: " << MaxSafeScalableVF
                    << ".\n");

  // First analyze the UserVF, fall back if the UserVF should be ignored.
  if (UserVF) {
    auto MaxSafeUserVF =
        UserVF.isScalable() ? MaxSafeScalableVF : MaxSafeFixedVF;

    if (ElementCount::isKnownLE(LHS: UserVF, RHS: MaxSafeUserVF)) {
      // If `VF=vscale x N` is safe, then so is `VF=N`
      if (UserVF.isScalable())
        return FixedScalableVFPair(
            ElementCount::getFixed(MinVal: UserVF.getKnownMinValue()), UserVF);

      return UserVF;
    }

    assert(ElementCount::isKnownGT(UserVF, MaxSafeUserVF));

    // Only clamp if the UserVF is not scalable. If the UserVF is scalable, it
    // is better to ignore the hint and let the compiler choose a suitable VF.
    if (!UserVF.isScalable()) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe, clamping to max safe VF="
                        << MaxSafeFixedVF << ".\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe, clamping to maximum safe vectorization factor "
               << ore::NV("VectorizationFactor", MaxSafeFixedVF);
      });
      return MaxSafeFixedVF;
    }

    // A scalable UserVF that is unsafe or unsupported is dropped entirely
    // (with a remark); selection falls through to the target-driven path.
    if (!TTI.supportsScalableVectors() && !ForceTargetSupportsScalableVectors) {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is ignored because scalable vectors are not "
                           "available.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is ignored because the target does not support scalable "
                  "vectors. The compiler will pick a more suitable value.";
      });
    } else {
      LLVM_DEBUG(dbgs() << "LV: User VF=" << UserVF
                        << " is unsafe. Ignoring scalable UserVF.\n");
      ORE->emit(RemarkBuilder: [&]() {
        return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationFactor",
                                          TheLoop->getStartLoc(),
                                          TheLoop->getHeader())
               << "User-specified vectorization factor "
               << ore::NV("UserVectorizationFactor", UserVF)
               << " is unsafe. Ignoring the hint to let the compiler pick a "
                  "more suitable value.";
      });
    }
  }

  LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
                    << " / " << WidestType << " bits.\n");

  // Compute target-maximized VFs within the safe bounds; fixed defaults to 1
  // (scalar) and scalable to 0 (none) when maximization fails.
  FixedScalableVFPair Result(ElementCount::getFixed(MinVal: 1),
                             ElementCount::getScalable(MinVal: 0));
  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeFixedVF, FoldTailByMasking))
    Result.FixedVF = MaxVF;

  if (auto MaxVF =
          getMaximizedVFForTarget(MaxTripCount, SmallestType, WidestType,
                                  MaxSafeVF: MaxSafeScalableVF, FoldTailByMasking))
    if (MaxVF.isScalable()) {
      Result.ScalableVF = MaxVF;
      LLVM_DEBUG(dbgs() << "LV: Found feasible scalable VF = " << MaxVF
                        << "\n");
    }

  return Result;
}
3626
3627FixedScalableVFPair
3628LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) {
3629 if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
3630 // TODO: It may be useful to do since it's still likely to be dynamically
3631 // uniform if the target can skip.
3632 reportVectorizationFailure(
3633 DebugMsg: "Not inserting runtime ptr check for divergent target",
3634 OREMsg: "runtime pointer checks needed. Not enabled for divergent target",
3635 ORETag: "CantVersionLoopWithDivergentTarget", ORE, TheLoop);
3636 return FixedScalableVFPair::getNone();
3637 }
3638
3639 ScalarEvolution *SE = PSE.getSE();
3640 ElementCount TC = getSmallConstantTripCount(SE, L: TheLoop);
3641 unsigned MaxTC = PSE.getSmallConstantMaxTripCount();
3642 LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
3643 if (TC != ElementCount::getFixed(MinVal: MaxTC))
3644 LLVM_DEBUG(dbgs() << "LV: Found maximum trip count: " << MaxTC << '\n');
3645 if (TC.isScalar()) {
3646 reportVectorizationFailure(DebugMsg: "Single iteration (non) loop",
3647 OREMsg: "loop trip count is one, irrelevant for vectorization",
3648 ORETag: "SingleIterationLoop", ORE, TheLoop);
3649 return FixedScalableVFPair::getNone();
3650 }
3651
3652 // If BTC matches the widest induction type and is -1 then the trip count
3653 // computation will wrap to 0 and the vector trip count will be 0. Do not try
3654 // to vectorize.
3655 const SCEV *BTC = SE->getBackedgeTakenCount(L: TheLoop);
3656 if (!isa<SCEVCouldNotCompute>(Val: BTC) &&
3657 BTC->getType()->getScalarSizeInBits() >=
3658 Legal->getWidestInductionType()->getScalarSizeInBits() &&
3659 SE->isKnownPredicate(Pred: CmpInst::ICMP_EQ, LHS: BTC,
3660 RHS: SE->getMinusOne(Ty: BTC->getType()))) {
3661 reportVectorizationFailure(
3662 DebugMsg: "Trip count computation wrapped",
3663 OREMsg: "backedge-taken count is -1, loop trip count wrapped to 0",
3664 ORETag: "TripCountWrapped", ORE, TheLoop);
3665 return FixedScalableVFPair::getNone();
3666 }
3667
3668 switch (ScalarEpilogueStatus) {
3669 case CM_ScalarEpilogueAllowed:
3670 return computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: false);
3671 case CM_ScalarEpilogueNotAllowedUsePredicate:
3672 [[fallthrough]];
3673 case CM_ScalarEpilogueNotNeededUsePredicate:
3674 LLVM_DEBUG(
3675 dbgs() << "LV: vector predicate hint/switch found.\n"
3676 << "LV: Not allowing scalar epilogue, creating predicated "
3677 << "vector loop.\n");
3678 break;
3679 case CM_ScalarEpilogueNotAllowedLowTripLoop:
3680 // fallthrough as a special case of OptForSize
3681 case CM_ScalarEpilogueNotAllowedOptSize:
3682 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedOptSize)
3683 LLVM_DEBUG(
3684 dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n");
3685 else
3686 LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip "
3687 << "count.\n");
3688
3689 // Bail if runtime checks are required, which are not good when optimising
3690 // for size.
3691 if (runtimeChecksRequired())
3692 return FixedScalableVFPair::getNone();
3693
3694 break;
3695 }
3696
3697 // Now try the tail folding
3698
3699 // Invalidate interleave groups that require an epilogue if we can't mask
3700 // the interleave-group.
3701 if (!useMaskedInterleavedAccesses(TTI)) {
3702 assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
3703 "No decisions should have been taken at this point");
3704 // Note: There is no need to invalidate any cost modeling decisions here, as
3705 // none were taken so far.
3706 InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
3707 }
3708
3709 FixedScalableVFPair MaxFactors = computeFeasibleMaxVF(MaxTripCount: MaxTC, UserVF, FoldTailByMasking: true);
3710
3711 // Avoid tail folding if the trip count is known to be a multiple of any VF
3712 // we choose.
3713 std::optional<unsigned> MaxPowerOf2RuntimeVF =
3714 MaxFactors.FixedVF.getFixedValue();
3715 if (MaxFactors.ScalableVF) {
3716 std::optional<unsigned> MaxVScale = getMaxVScale(F: *TheFunction, TTI);
3717 if (MaxVScale && TTI.isVScaleKnownToBeAPowerOfTwo()) {
3718 MaxPowerOf2RuntimeVF = std::max<unsigned>(
3719 a: *MaxPowerOf2RuntimeVF,
3720 b: *MaxVScale * MaxFactors.ScalableVF.getKnownMinValue());
3721 } else
3722 MaxPowerOf2RuntimeVF = std::nullopt; // Stick with tail-folding for now.
3723 }
3724
3725 auto NoScalarEpilogueNeeded = [this, &UserIC](unsigned MaxVF) {
3726 // Return false if the loop is neither a single-latch-exit loop nor an
3727 // early-exit loop as tail-folding is not supported in that case.
3728 if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch() &&
3729 !Legal->hasUncountableEarlyExit())
3730 return false;
3731 unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
3732 ScalarEvolution *SE = PSE.getSE();
3733 // Calling getSymbolicMaxBackedgeTakenCount enables support for loops
3734 // with uncountable exits. For countable loops, the symbolic maximum must
3735 // remain identical to the known back-edge taken count.
3736 const SCEV *BackedgeTakenCount = PSE.getSymbolicMaxBackedgeTakenCount();
3737 assert((Legal->hasUncountableEarlyExit() ||
3738 BackedgeTakenCount == PSE.getBackedgeTakenCount()) &&
3739 "Invalid loop count");
3740 const SCEV *ExitCount = SE->getAddExpr(
3741 LHS: BackedgeTakenCount, RHS: SE->getOne(Ty: BackedgeTakenCount->getType()));
3742 const SCEV *Rem = SE->getURemExpr(
3743 LHS: SE->applyLoopGuards(Expr: ExitCount, L: TheLoop),
3744 RHS: SE->getConstant(Ty: BackedgeTakenCount->getType(), V: MaxVFtimesIC));
3745 return Rem->isZero();
3746 };
3747
3748 if (MaxPowerOf2RuntimeVF > 0u) {
3749 assert((UserVF.isNonZero() || isPowerOf2_32(*MaxPowerOf2RuntimeVF)) &&
3750 "MaxFixedVF must be a power of 2");
3751 if (NoScalarEpilogueNeeded(*MaxPowerOf2RuntimeVF)) {
3752 // Accept MaxFixedVF if we do not have a tail.
3753 LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
3754 return MaxFactors;
3755 }
3756 }
3757
3758 auto ExpectedTC = getSmallBestKnownTC(PSE, L: TheLoop);
3759 if (ExpectedTC && ExpectedTC->isFixed() &&
3760 ExpectedTC->getFixedValue() <=
3761 TTI.getMinTripCountTailFoldingThreshold()) {
3762 if (MaxPowerOf2RuntimeVF > 0u) {
3763 // If we have a low-trip-count, and the fixed-width VF is known to divide
3764 // the trip count but the scalable factor does not, use the fixed-width
3765 // factor in preference to allow the generation of a non-predicated loop.
3766 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedLowTripLoop &&
3767 NoScalarEpilogueNeeded(MaxFactors.FixedVF.getFixedValue())) {
3768 LLVM_DEBUG(dbgs() << "LV: Picking a fixed-width so that no tail will "
3769 "remain for any chosen VF.\n");
3770 MaxFactors.ScalableVF = ElementCount::getScalable(MinVal: 0);
3771 return MaxFactors;
3772 }
3773 }
3774
3775 reportVectorizationFailure(
3776 DebugMsg: "The trip count is below the minial threshold value.",
3777 OREMsg: "loop trip count is too low, avoiding vectorization", ORETag: "LowTripCount",
3778 ORE, TheLoop);
3779 return FixedScalableVFPair::getNone();
3780 }
3781
3782 // If we don't know the precise trip count, or if the trip count that we
3783 // found modulo the vectorization factor is not zero, try to fold the tail
3784 // by masking.
3785 // FIXME: look for a smaller MaxVF that does divide TC rather than masking.
3786 bool ContainsScalableVF = MaxFactors.ScalableVF.isNonZero();
3787 setTailFoldingStyles(IsScalableVF: ContainsScalableVF, UserIC);
3788 if (foldTailByMasking()) {
3789 if (getTailFoldingStyle() == TailFoldingStyle::DataWithEVL) {
3790 LLVM_DEBUG(
3791 dbgs()
3792 << "LV: tail is folded with EVL, forcing unroll factor to be 1. Will "
3793 "try to generate VP Intrinsics with scalable vector "
3794 "factors only.\n");
3795 // Tail folded loop using VP intrinsics restricts the VF to be scalable
3796 // for now.
3797 // TODO: extend it for fixed vectors, if required.
3798 assert(ContainsScalableVF && "Expected scalable vector factor.");
3799
3800 MaxFactors.FixedVF = ElementCount::getFixed(MinVal: 1);
3801 }
3802 return MaxFactors;
3803 }
3804
3805 // If there was a tail-folding hint/switch, but we can't fold the tail by
3806 // masking, fallback to a vectorization with a scalar epilogue.
3807 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
3808 LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
3809 "scalar epilogue instead.\n");
3810 ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
3811 return MaxFactors;
3812 }
3813
3814 if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate) {
3815 LLVM_DEBUG(dbgs() << "LV: Can't fold tail by masking: don't vectorize\n");
3816 return FixedScalableVFPair::getNone();
3817 }
3818
3819 if (TC.isZero()) {
3820 reportVectorizationFailure(
3821 DebugMsg: "unable to calculate the loop count due to complex control flow",
3822 ORETag: "UnknownLoopCountComplexCFG", ORE, TheLoop);
3823 return FixedScalableVFPair::getNone();
3824 }
3825
3826 reportVectorizationFailure(
3827 DebugMsg: "Cannot optimize for size and vectorize at the same time.",
3828 OREMsg: "cannot optimize for size and vectorize at the same time. "
3829 "Enable vectorization of this loop with '#pragma clang loop "
3830 "vectorize(enable)' when compiling with -Os/-Oz",
3831 ORETag: "NoTailLoopWithOptForSize", ORE, TheLoop);
3832 return FixedScalableVFPair::getNone();
3833}
3834
3835bool LoopVectorizationCostModel::useMaxBandwidth(ElementCount VF) {
3836 return useMaxBandwidth(RegKind: VF.isScalable()
3837 ? TargetTransformInfo::RGK_ScalableVector
3838 : TargetTransformInfo::RGK_FixedWidthVector);
3839}
3840
3841bool LoopVectorizationCostModel::useMaxBandwidth(
3842 TargetTransformInfo::RegisterKind RegKind) {
3843 return MaximizeBandwidth || (MaximizeBandwidth.getNumOccurrences() == 0 &&
3844 (TTI.shouldMaximizeVectorBandwidth(K: RegKind) ||
3845 (UseWiderVFIfCallVariantsPresent &&
3846 Legal->hasVectorCallVariants())));
3847}
3848
3849ElementCount LoopVectorizationCostModel::getMaximizedVFForTarget(
3850 unsigned MaxTripCount, unsigned SmallestType, unsigned WidestType,
3851 ElementCount MaxSafeVF, bool FoldTailByMasking) {
3852 bool ComputeScalableMaxVF = MaxSafeVF.isScalable();
3853 const TypeSize WidestRegister = TTI.getRegisterBitWidth(
3854 K: ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3855 : TargetTransformInfo::RGK_FixedWidthVector);
3856
3857 // Convenience function to return the minimum of two ElementCounts.
3858 auto MinVF = [](const ElementCount &LHS, const ElementCount &RHS) {
3859 assert((LHS.isScalable() == RHS.isScalable()) &&
3860 "Scalable flags must match");
3861 return ElementCount::isKnownLT(LHS, RHS) ? LHS : RHS;
3862 };
3863
3864 // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
3865 // Note that both WidestRegister and WidestType may not be a powers of 2.
3866 auto MaxVectorElementCount = ElementCount::get(
3867 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / WidestType),
3868 Scalable: ComputeScalableMaxVF);
3869 MaxVectorElementCount = MinVF(MaxVectorElementCount, MaxSafeVF);
3870 LLVM_DEBUG(dbgs() << "LV: The Widest register safe to use is: "
3871 << (MaxVectorElementCount * WidestType) << " bits.\n");
3872
3873 if (!MaxVectorElementCount) {
3874 LLVM_DEBUG(dbgs() << "LV: The target has no "
3875 << (ComputeScalableMaxVF ? "scalable" : "fixed")
3876 << " vector registers.\n");
3877 return ElementCount::getFixed(MinVal: 1);
3878 }
3879
3880 unsigned WidestRegisterMinEC = MaxVectorElementCount.getKnownMinValue();
3881 if (MaxVectorElementCount.isScalable() &&
3882 TheFunction->hasFnAttribute(Kind: Attribute::VScaleRange)) {
3883 auto Attr = TheFunction->getFnAttribute(Kind: Attribute::VScaleRange);
3884 auto Min = Attr.getVScaleRangeMin();
3885 WidestRegisterMinEC *= Min;
3886 }
3887
3888 // When a scalar epilogue is required, at least one iteration of the scalar
3889 // loop has to execute. Adjust MaxTripCount accordingly to avoid picking a
3890 // max VF that results in a dead vector loop.
3891 if (MaxTripCount > 0 && requiresScalarEpilogue(IsVectorizing: true))
3892 MaxTripCount -= 1;
3893
3894 if (MaxTripCount && MaxTripCount <= WidestRegisterMinEC &&
3895 (!FoldTailByMasking || isPowerOf2_32(Value: MaxTripCount))) {
3896 // If upper bound loop trip count (TC) is known at compile time there is no
3897 // point in choosing VF greater than TC (as done in the loop below). Select
3898 // maximum power of two which doesn't exceed TC. If MaxVectorElementCount is
3899 // scalable, we only fall back on a fixed VF when the TC is less than or
3900 // equal to the known number of lanes.
3901 auto ClampedUpperTripCount = llvm::bit_floor(Value: MaxTripCount);
3902 LLVM_DEBUG(dbgs() << "LV: Clamping the MaxVF to maximum power of two not "
3903 "exceeding the constant trip count: "
3904 << ClampedUpperTripCount << "\n");
3905 return ElementCount::get(
3906 MinVal: ClampedUpperTripCount,
3907 Scalable: FoldTailByMasking ? MaxVectorElementCount.isScalable() : false);
3908 }
3909
3910 TargetTransformInfo::RegisterKind RegKind =
3911 ComputeScalableMaxVF ? TargetTransformInfo::RGK_ScalableVector
3912 : TargetTransformInfo::RGK_FixedWidthVector;
3913 ElementCount MaxVF = MaxVectorElementCount;
3914 if (useMaxBandwidth(RegKind)) {
3915 auto MaxVectorElementCountMaxBW = ElementCount::get(
3916 MinVal: llvm::bit_floor(Value: WidestRegister.getKnownMinValue() / SmallestType),
3917 Scalable: ComputeScalableMaxVF);
3918 MaxVF = MinVF(MaxVectorElementCountMaxBW, MaxSafeVF);
3919
3920 if (ElementCount MinVF =
3921 TTI.getMinimumVF(ElemWidth: SmallestType, IsScalable: ComputeScalableMaxVF)) {
3922 if (ElementCount::isKnownLT(LHS: MaxVF, RHS: MinVF)) {
3923 LLVM_DEBUG(dbgs() << "LV: Overriding calculated MaxVF(" << MaxVF
3924 << ") with target's minimum: " << MinVF << '\n');
3925 MaxVF = MinVF;
3926 }
3927 }
3928
3929 // Invalidate any widening decisions we might have made, in case the loop
3930 // requires prediction (decided later), but we have already made some
3931 // load/store widening decisions.
3932 invalidateCostModelingDecisions();
3933 }
3934 return MaxVF;
3935}
3936
3937bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
3938 const VectorizationFactor &B,
3939 const unsigned MaxTripCount,
3940 bool HasTail) const {
3941 InstructionCost CostA = A.Cost;
3942 InstructionCost CostB = B.Cost;
3943
3944 // Improve estimate for the vector width if it is scalable.
3945 unsigned EstimatedWidthA = A.Width.getKnownMinValue();
3946 unsigned EstimatedWidthB = B.Width.getKnownMinValue();
3947 if (std::optional<unsigned> VScale = CM.getVScaleForTuning()) {
3948 if (A.Width.isScalable())
3949 EstimatedWidthA *= *VScale;
3950 if (B.Width.isScalable())
3951 EstimatedWidthB *= *VScale;
3952 }
3953
3954 // When optimizing for size choose whichever is smallest, which will be the
3955 // one with the smallest cost for the whole loop. On a tie pick the larger
3956 // vector width, on the assumption that throughput will be greater.
3957 if (CM.CostKind == TTI::TCK_CodeSize)
3958 return CostA < CostB ||
3959 (CostA == CostB && EstimatedWidthA > EstimatedWidthB);
3960
3961 // Assume vscale may be larger than 1 (or the value being tuned for),
3962 // so that scalable vectorization is slightly favorable over fixed-width
3963 // vectorization.
3964 bool PreferScalable = !TTI.preferFixedOverScalableIfEqualCost() &&
3965 A.Width.isScalable() && !B.Width.isScalable();
3966
3967 auto CmpFn = [PreferScalable](const InstructionCost &LHS,
3968 const InstructionCost &RHS) {
3969 return PreferScalable ? LHS <= RHS : LHS < RHS;
3970 };
3971
3972 // To avoid the need for FP division:
3973 // (CostA / EstimatedWidthA) < (CostB / EstimatedWidthB)
3974 // <=> (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA)
3975 if (!MaxTripCount)
3976 return CmpFn(CostA * EstimatedWidthB, CostB * EstimatedWidthA);
3977
3978 auto GetCostForTC = [MaxTripCount, HasTail](unsigned VF,
3979 InstructionCost VectorCost,
3980 InstructionCost ScalarCost) {
3981 // If the trip count is a known (possibly small) constant, the trip count
3982 // will be rounded up to an integer number of iterations under
3983 // FoldTailByMasking. The total cost in that case will be
3984 // VecCost*ceil(TripCount/VF). When not folding the tail, the total
3985 // cost will be VecCost*floor(TC/VF) + ScalarCost*(TC%VF). There will be
3986 // some extra overheads, but for the purpose of comparing the costs of
3987 // different VFs we can use this to compare the total loop-body cost
3988 // expected after vectorization.
3989 if (HasTail)
3990 return VectorCost * (MaxTripCount / VF) +
3991 ScalarCost * (MaxTripCount % VF);
3992 return VectorCost * divideCeil(Numerator: MaxTripCount, Denominator: VF);
3993 };
3994
3995 auto RTCostA = GetCostForTC(EstimatedWidthA, CostA, A.ScalarCost);
3996 auto RTCostB = GetCostForTC(EstimatedWidthB, CostB, B.ScalarCost);
3997 return CmpFn(RTCostA, RTCostB);
3998}
3999
4000bool LoopVectorizationPlanner::isMoreProfitable(const VectorizationFactor &A,
4001 const VectorizationFactor &B,
4002 bool HasTail) const {
4003 const unsigned MaxTripCount = PSE.getSmallConstantMaxTripCount();
4004 return LoopVectorizationPlanner::isMoreProfitable(A, B, MaxTripCount,
4005 HasTail);
4006}
4007
4008void LoopVectorizationPlanner::emitInvalidCostRemarks(
4009 OptimizationRemarkEmitter *ORE) {
4010 using RecipeVFPair = std::pair<VPRecipeBase *, ElementCount>;
4011 SmallVector<RecipeVFPair> InvalidCosts;
4012 for (const auto &Plan : VPlans) {
4013 for (ElementCount VF : Plan->vectorFactors()) {
4014 // The VPlan-based cost model is designed for computing vector cost.
4015 // Querying VPlan-based cost model with a scarlar VF will cause some
4016 // errors because we expect the VF is vector for most of the widen
4017 // recipes.
4018 if (VF.isScalar())
4019 continue;
4020
4021 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(),
4022 CM, CM.CostKind);
4023 precomputeCosts(Plan&: *Plan, VF, CostCtx);
4024 auto Iter = vp_depth_first_deep(G: Plan->getVectorLoopRegion()->getEntry());
4025 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: Iter)) {
4026 for (auto &R : *VPBB) {
4027 if (!R.cost(VF, Ctx&: CostCtx).isValid())
4028 InvalidCosts.emplace_back(Args: &R, Args&: VF);
4029 }
4030 }
4031 }
4032 }
4033 if (InvalidCosts.empty())
4034 return;
4035
4036 // Emit a report of VFs with invalid costs in the loop.
4037
4038 // Group the remarks per recipe, keeping the recipe order from InvalidCosts.
4039 DenseMap<VPRecipeBase *, unsigned> Numbering;
4040 unsigned I = 0;
4041 for (auto &Pair : InvalidCosts)
4042 if (Numbering.try_emplace(Key: Pair.first, Args&: I).second)
4043 ++I;
4044
4045 // Sort the list, first on recipe(number) then on VF.
4046 sort(C&: InvalidCosts, Comp: [&Numbering](RecipeVFPair &A, RecipeVFPair &B) {
4047 unsigned NA = Numbering[A.first];
4048 unsigned NB = Numbering[B.first];
4049 if (NA != NB)
4050 return NA < NB;
4051 return ElementCount::isKnownLT(LHS: A.second, RHS: B.second);
4052 });
4053
4054 // For a list of ordered recipe-VF pairs:
4055 // [(load, VF1), (load, VF2), (store, VF1)]
4056 // group the recipes together to emit separate remarks for:
4057 // load (VF1, VF2)
4058 // store (VF1)
4059 auto Tail = ArrayRef<RecipeVFPair>(InvalidCosts);
4060 auto Subset = ArrayRef<RecipeVFPair>();
4061 do {
4062 if (Subset.empty())
4063 Subset = Tail.take_front(N: 1);
4064
4065 VPRecipeBase *R = Subset.front().first;
4066
4067 unsigned Opcode =
4068 TypeSwitch<const VPRecipeBase *, unsigned>(R)
4069 .Case<VPHeaderPHIRecipe>(
4070 caseFn: [](const auto *R) { return Instruction::PHI; })
4071 .Case<VPWidenSelectRecipe>(
4072 caseFn: [](const auto *R) { return Instruction::Select; })
4073 .Case<VPWidenStoreRecipe>(
4074 caseFn: [](const auto *R) { return Instruction::Store; })
4075 .Case<VPWidenLoadRecipe>(
4076 caseFn: [](const auto *R) { return Instruction::Load; })
4077 .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
4078 caseFn: [](const auto *R) { return Instruction::Call; })
4079 .Case<VPInstruction, VPWidenRecipe, VPReplicateRecipe,
4080 VPWidenCastRecipe>(
4081 caseFn: [](const auto *R) { return R->getOpcode(); })
4082 .Case<VPInterleaveRecipe>(caseFn: [](const VPInterleaveRecipe *R) {
4083 return R->getStoredValues().empty() ? Instruction::Load
4084 : Instruction::Store;
4085 });
4086
4087 // If the next recipe is different, or if there are no other pairs,
4088 // emit a remark for the collated subset. e.g.
4089 // [(load, VF1), (load, VF2))]
4090 // to emit:
4091 // remark: invalid costs for 'load' at VF=(VF1, VF2)
4092 if (Subset == Tail || Tail[Subset.size()].first != R) {
4093 std::string OutString;
4094 raw_string_ostream OS(OutString);
4095 assert(!Subset.empty() && "Unexpected empty range");
4096 OS << "Recipe with invalid costs prevented vectorization at VF=(";
4097 for (const auto &Pair : Subset)
4098 OS << (Pair.second == Subset.front().second ? "" : ", ") << Pair.second;
4099 OS << "):";
4100 if (Opcode == Instruction::Call) {
4101 StringRef Name = "";
4102 if (auto *Int = dyn_cast<VPWidenIntrinsicRecipe>(Val: R)) {
4103 Name = Int->getIntrinsicName();
4104 } else {
4105 auto *WidenCall = dyn_cast<VPWidenCallRecipe>(Val: R);
4106 Function *CalledFn =
4107 WidenCall ? WidenCall->getCalledScalarFunction()
4108 : cast<Function>(Val: R->getOperand(N: R->getNumOperands() - 1)
4109 ->getLiveInIRValue());
4110 Name = CalledFn->getName();
4111 }
4112 OS << " call to " << Name;
4113 } else
4114 OS << " " << Instruction::getOpcodeName(Opcode);
4115 reportVectorizationInfo(Msg: OutString, ORETag: "InvalidCost", ORE, TheLoop: OrigLoop, I: nullptr,
4116 DL: R->getDebugLoc());
4117 Tail = Tail.drop_front(N: Subset.size());
4118 Subset = {};
4119 } else
4120 // Grow the subset by one element
4121 Subset = Tail.take_front(N: Subset.size() + 1);
4122 } while (!Tail.empty());
4123}
4124
4125/// Check if any recipe of \p Plan will generate a vector value, which will be
4126/// assigned a vector register.
4127static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
4128 const TargetTransformInfo &TTI) {
4129 assert(VF.isVector() && "Checking a scalar VF?");
4130 VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
4131 DenseSet<VPRecipeBase *> EphemeralRecipes;
4132 collectEphemeralRecipesForVPlan(Plan, EphRecipes&: EphemeralRecipes);
4133 // Set of already visited types.
4134 DenseSet<Type *> Visited;
4135 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4136 Range: vp_depth_first_shallow(G: Plan.getVectorLoopRegion()->getEntry()))) {
4137 for (VPRecipeBase &R : *VPBB) {
4138 if (EphemeralRecipes.contains(V: &R))
4139 continue;
4140 // Continue early if the recipe is considered to not produce a vector
4141 // result. Note that this includes VPInstruction where some opcodes may
4142 // produce a vector, to preserve existing behavior as VPInstructions model
4143 // aspects not directly mapped to existing IR instructions.
4144 switch (R.getVPDefID()) {
4145 case VPDef::VPDerivedIVSC:
4146 case VPDef::VPScalarIVStepsSC:
4147 case VPDef::VPReplicateSC:
4148 case VPDef::VPInstructionSC:
4149 case VPDef::VPCanonicalIVPHISC:
4150 case VPDef::VPVectorPointerSC:
4151 case VPDef::VPVectorEndPointerSC:
4152 case VPDef::VPExpandSCEVSC:
4153 case VPDef::VPEVLBasedIVPHISC:
4154 case VPDef::VPPredInstPHISC:
4155 case VPDef::VPBranchOnMaskSC:
4156 continue;
4157 case VPDef::VPReductionSC:
4158 case VPDef::VPActiveLaneMaskPHISC:
4159 case VPDef::VPWidenCallSC:
4160 case VPDef::VPWidenCanonicalIVSC:
4161 case VPDef::VPWidenCastSC:
4162 case VPDef::VPWidenGEPSC:
4163 case VPDef::VPWidenIntrinsicSC:
4164 case VPDef::VPWidenSC:
4165 case VPDef::VPWidenSelectSC:
4166 case VPDef::VPBlendSC:
4167 case VPDef::VPFirstOrderRecurrencePHISC:
4168 case VPDef::VPHistogramSC:
4169 case VPDef::VPWidenPHISC:
4170 case VPDef::VPWidenIntOrFpInductionSC:
4171 case VPDef::VPWidenPointerInductionSC:
4172 case VPDef::VPReductionPHISC:
4173 case VPDef::VPInterleaveSC:
4174 case VPDef::VPWidenLoadEVLSC:
4175 case VPDef::VPWidenLoadSC:
4176 case VPDef::VPWidenStoreEVLSC:
4177 case VPDef::VPWidenStoreSC:
4178 break;
4179 default:
4180 llvm_unreachable("unhandled recipe");
4181 }
4182
4183 auto WillGenerateTargetVectors = [&TTI, VF](Type *VectorTy) {
4184 unsigned NumLegalParts = TTI.getNumberOfParts(Tp: VectorTy);
4185 if (!NumLegalParts)
4186 return false;
4187 if (VF.isScalable()) {
4188 // <vscale x 1 x iN> is assumed to be profitable over iN because
4189 // scalable registers are a distinct register class from scalar
4190 // ones. If we ever find a target which wants to lower scalable
4191 // vectors back to scalars, we'll need to update this code to
4192 // explicitly ask TTI about the register class uses for each part.
4193 return NumLegalParts <= VF.getKnownMinValue();
4194 }
4195 // Two or more elements that share a register - are vectorized.
4196 return NumLegalParts < VF.getFixedValue();
4197 };
4198
4199 // If no def nor is a store, e.g., branches, continue - no value to check.
4200 if (R.getNumDefinedValues() == 0 &&
4201 !isa<VPWidenStoreRecipe, VPWidenStoreEVLRecipe, VPInterleaveRecipe>(
4202 Val: &R))
4203 continue;
4204 // For multi-def recipes, currently only interleaved loads, suffice to
4205 // check first def only.
4206 // For stores check their stored value; for interleaved stores suffice
4207 // the check first stored value only. In all cases this is the second
4208 // operand.
4209 VPValue *ToCheck =
4210 R.getNumDefinedValues() >= 1 ? R.getVPValue(I: 0) : R.getOperand(N: 1);
4211 Type *ScalarTy = TypeInfo.inferScalarType(V: ToCheck);
4212 if (!Visited.insert(V: {ScalarTy}).second)
4213 continue;
4214 Type *WideTy = toVectorizedTy(Ty: ScalarTy, EC: VF);
4215 if (any_of(Range: getContainedTypes(Ty: WideTy), P: WillGenerateTargetVectors))
4216 return true;
4217 }
4218 }
4219
4220 return false;
4221}
4222
4223static bool hasReplicatorRegion(VPlan &Plan) {
4224 return any_of(Range: VPBlockUtils::blocksOnly<VPRegionBlock>(Range: vp_depth_first_shallow(
4225 G: Plan.getVectorLoopRegion()->getEntry())),
4226 P: [](auto *VPRB) { return VPRB->isReplicator(); });
4227}
4228
4229#ifndef NDEBUG
4230VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() {
4231 InstructionCost ExpectedCost = CM.expectedCost(ElementCount::getFixed(1));
4232 LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
4233 assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
4234 assert(
4235 any_of(VPlans,
4236 [](std::unique_ptr<VPlan> &P) { return P->hasScalarVFOnly(); }) &&
4237 "Expected Scalar VF to be a candidate");
4238
4239 const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
4240 ExpectedCost);
4241 VectorizationFactor ChosenFactor = ScalarCost;
4242
4243 bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
4244 if (ForceVectorization &&
4245 (VPlans.size() > 1 || !VPlans[0]->hasScalarVFOnly())) {
4246 // Ignore scalar width, because the user explicitly wants vectorization.
4247 // Initialize cost to max so that VF = 2 is, at least, chosen during cost
4248 // evaluation.
4249 ChosenFactor.Cost = InstructionCost::getMax();
4250 }
4251
4252 for (auto &P : VPlans) {
4253 ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
4254 P->vectorFactors().end());
4255
4256 SmallVector<VPRegisterUsage, 8> RUs;
4257 if (CM.useMaxBandwidth(TargetTransformInfo::RGK_ScalableVector) ||
4258 CM.useMaxBandwidth(TargetTransformInfo::RGK_FixedWidthVector))
4259 RUs = calculateRegisterUsageForPlan(*P, VFs, TTI, CM.ValuesToIgnore);
4260
4261 for (unsigned I = 0; I < VFs.size(); I++) {
4262 ElementCount VF = VFs[I];
4263 // The cost for scalar VF=1 is already calculated, so ignore it.
4264 if (VF.isScalar())
4265 continue;
4266
4267 /// Don't consider the VF if it exceeds the number of registers for the
4268 /// target.
4269 if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI))
4270 continue;
4271
4272 InstructionCost C = CM.expectedCost(VF);
4273
4274 // Add on other costs that are modelled in VPlan, but not in the legacy
4275 // cost model.
4276 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
4277 CM, CM.CostKind);
4278 VPRegionBlock *VectorRegion = P->getVectorLoopRegion();
4279 assert(VectorRegion && "Expected to have a vector region!");
4280 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
4281 vp_depth_first_shallow(VectorRegion->getEntry()))) {
4282 for (VPRecipeBase &R : *VPBB) {
4283 auto *VPI = dyn_cast<VPInstruction>(&R);
4284 if (!VPI)
4285 continue;
4286 switch (VPI->getOpcode()) {
4287 case VPInstruction::ActiveLaneMask:
4288 case VPInstruction::ExplicitVectorLength:
4289 C += VPI->cost(VF, CostCtx);
4290 break;
4291 default:
4292 break;
4293 }
4294 }
4295 }
4296
4297 VectorizationFactor Candidate(VF, C, ScalarCost.ScalarCost);
4298 unsigned Width =
4299 getEstimatedRuntimeVF(Candidate.Width, CM.getVScaleForTuning());
4300 LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << VF
4301 << " costs: " << (Candidate.Cost / Width));
4302 if (VF.isScalable())
4303 LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
4304 << CM.getVScaleForTuning().value_or(1) << ")");
4305 LLVM_DEBUG(dbgs() << ".\n");
4306
4307 if (!ForceVectorization && !willGenerateVectors(*P, VF, TTI)) {
4308 LLVM_DEBUG(
4309 dbgs()
4310 << "LV: Not considering vector loop of width " << VF
4311 << " because it will not generate any vector instructions.\n");
4312 continue;
4313 }
4314
4315 if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(*P)) {
4316 LLVM_DEBUG(
4317 dbgs()
4318 << "LV: Not considering vector loop of width " << VF
4319 << " because it would cause replicated blocks to be generated,"
4320 << " which isn't allowed when optimizing for size.\n");
4321 continue;
4322 }
4323
4324 if (isMoreProfitable(Candidate, ChosenFactor, P->hasScalarTail()))
4325 ChosenFactor = Candidate;
4326 }
4327 }
4328
4329 if (!EnableCondStoresVectorization && CM.hasPredStores()) {
4330 reportVectorizationFailure(
4331 "There are conditional stores.",
4332 "store that is conditionally executed prevents vectorization",
4333 "ConditionalStore", ORE, OrigLoop);
4334 ChosenFactor = ScalarCost;
4335 }
4336
4337 LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() &&
4338 !isMoreProfitable(ChosenFactor, ScalarCost,
4339 !CM.foldTailByMasking())) dbgs()
4340 << "LV: Vectorization seems to be not beneficial, "
4341 << "but was forced by a user.\n");
4342 return ChosenFactor;
4343}
4344#endif
4345
4346bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization(
4347 ElementCount VF) const {
4348 // Cross iteration phis such as fixed-order recurrences and FMaxNum/FMinNum
4349 // reductions need special handling and are currently unsupported.
4350 if (any_of(Range: OrigLoop->getHeader()->phis(), P: [&](PHINode &Phi) {
4351 if (!Legal->isReductionVariable(PN: &Phi))
4352 return Legal->isFixedOrderRecurrence(Phi: &Phi);
4353 RecurKind RK = Legal->getRecurrenceDescriptor(PN: &Phi).getRecurrenceKind();
4354 return RK == RecurKind::FMinNum || RK == RecurKind::FMaxNum;
4355 }))
4356 return false;
4357
4358 // Phis with uses outside of the loop require special handling and are
4359 // currently unsupported.
4360 for (const auto &Entry : Legal->getInductionVars()) {
4361 // Look for uses of the value of the induction at the last iteration.
4362 Value *PostInc =
4363 Entry.first->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch());
4364 for (User *U : PostInc->users())
4365 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4366 return false;
4367 // Look for uses of penultimate value of the induction.
4368 for (User *U : Entry.first->users())
4369 if (!OrigLoop->contains(Inst: cast<Instruction>(Val: U)))
4370 return false;
4371 }
4372
4373 // Epilogue vectorization code has not been auditted to ensure it handles
4374 // non-latch exits properly. It may be fine, but it needs auditted and
4375 // tested.
4376 // TODO: Add support for loops with an early exit.
4377 if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch())
4378 return false;
4379
4380 return true;
4381}
4382
4383bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable(
4384 const ElementCount VF, const unsigned IC) const {
4385 // FIXME: We need a much better cost-model to take different parameters such
4386 // as register pressure, code size increase and cost of extra branches into
4387 // account. For now we apply a very crude heuristic and only consider loops
4388 // with vectorization factors larger than a certain value.
4389
4390 // Allow the target to opt out entirely.
4391 if (!TTI.preferEpilogueVectorization())
4392 return false;
4393
4394 // We also consider epilogue vectorization unprofitable for targets that don't
4395 // consider interleaving beneficial (eg. MVE).
4396 if (TTI.getMaxInterleaveFactor(VF) <= 1)
4397 return false;
4398
4399 // TODO: PR #108190 introduced a discrepancy between fixed-width and scalable
4400 // VFs when deciding profitability.
4401 // See related "TODO: extend to support scalable VFs." in
4402 // selectEpilogueVectorizationFactor.
4403 unsigned Multiplier = VF.isFixed() ? IC : 1;
4404 unsigned MinVFThreshold = EpilogueVectorizationMinVF.getNumOccurrences() > 0
4405 ? EpilogueVectorizationMinVF
4406 : TTI.getEpilogueVectorizationMinVF();
4407 return getEstimatedRuntimeVF(VF: VF * Multiplier, VScale: VScaleForTuning) >=
4408 MinVFThreshold;
4409}
4410
// Selects a vectorization factor for the epilogue loop accompanying a main
// loop vectorized with MainLoopVF and interleaved IC times. Returns
// VectorizationFactor::Disabled() when no viable or profitable epilogue
// factor exists.
VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor(
    const ElementCount MainLoopVF, unsigned IC) {
  VectorizationFactor Result = VectorizationFactor::Disabled();
  if (!EnableEpilogueVectorization) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n");
    return Result;
  }

  // Without a scalar epilogue there is no remainder loop to vectorize.
  if (!CM.isScalarEpilogueAllowed()) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "
                         "epilogue is allowed.\n");
    return Result;
  }

  // Not really a cost consideration, but check for unsupported cases here to
  // simplify the logic.
  if (!isCandidateForEpilogueVectorization(VF: MainLoopVF)) {
    LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop "
                         "is not a supported candidate.\n");
    return Result;
  }

  // A forced factor bypasses the cost checks below, but is only honored when
  // a VPlan was actually built for it.
  if (EpilogueVectorizationForceVF > 1) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n");
    ElementCount ForcedEC =
        ElementCount::getFixed(MinVal: EpilogueVectorizationForceVF);
    if (hasPlanWithVF(VF: ForcedEC))
      return {ForcedEC, 0, 0};

    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not "
                         "viable.\n");
    return Result;
  }

  // Epilogue vectorization grows code size; skip it when optimizing for size.
  if (OrigLoop->getHeader()->getParent()->hasOptSize()) {
    LLVM_DEBUG(
        dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n");
    return Result;
  }

  if (!CM.isEpilogueVectorizationProfitable(VF: MainLoopVF, IC)) {
    LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for "
                         "this loop\n");
    return Result;
  }

  // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know
  // the main loop handles 8 lanes per iteration. We could still benefit from
  // vectorizing the epilogue loop with VF=4.
  ElementCount EstimatedRuntimeVF = ElementCount::getFixed(
      MinVal: getEstimatedRuntimeVF(VF: MainLoopVF, VScale: CM.getVScaleForTuning()));

  // Scan the previously computed profitable VFs for the best candidate that
  // is narrower than the main loop's (estimated) VF.
  ScalarEvolution &SE = *PSE.getSE();
  Type *TCType = Legal->getWidestInductionType();
  const SCEV *RemainingIterations = nullptr;
  unsigned MaxTripCount = 0;
  for (auto &NextVF : ProfitableVFs) {
    // Skip candidate VFs without a corresponding VPlan.
    if (!hasPlanWithVF(VF: NextVF.Width))
      continue;

    // Skip candidate VFs with widths >= the (estimated) runtime VF (scalable
    // vectors) or > the VF of the main loop (fixed vectors).
    if ((!NextVF.Width.isScalable() && MainLoopVF.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: EstimatedRuntimeVF)) ||
        (NextVF.Width.isScalable() &&
         ElementCount::isKnownGE(LHS: NextVF.Width, RHS: MainLoopVF)) ||
        (!NextVF.Width.isScalable() && !MainLoopVF.isScalable() &&
         ElementCount::isKnownGT(LHS: NextVF.Width, RHS: MainLoopVF)))
      continue;

    // If NextVF is greater than the number of remaining iterations, the
    // epilogue loop would be dead. Skip such factors.
    if (!MainLoopVF.isScalable() && !NextVF.Width.isScalable()) {
      // TODO: extend to support scalable VFs.
      if (!RemainingIterations) {
        // Lazily compute TC % (MainLoopVF * IC), i.e. the iterations left
        // over for the epilogue, and an upper bound on its trip count.
        const SCEV *TC = vputils::getSCEVExprForVPValue(
            V: getPlanFor(VF: NextVF.Width).getTripCount(), SE);
        assert(!isa<SCEVCouldNotCompute>(TC) &&
               "Trip count SCEV must be computable");
        RemainingIterations = SE.getURemExpr(
            LHS: TC, RHS: SE.getConstant(Ty: TCType, V: MainLoopVF.getFixedValue() * IC));
        MaxTripCount = MainLoopVF.getFixedValue() * IC - 1;
        // Tighten the bound if SCEV can prove the remainder is smaller.
        if (SE.isKnownPredicate(Pred: CmpInst::ICMP_ULT, LHS: RemainingIterations,
                                RHS: SE.getConstant(Ty: TCType, V: MaxTripCount))) {
          MaxTripCount =
              SE.getUnsignedRangeMax(S: RemainingIterations).getZExtValue();
        }
        LLVM_DEBUG(dbgs() << "LEV: Maximum Trip Count for Epilogue: "
                          << MaxTripCount << "\n");
      }
      if (SE.isKnownPredicate(
              Pred: CmpInst::ICMP_UGT,
              LHS: SE.getConstant(Ty: TCType, V: NextVF.Width.getFixedValue()),
              RHS: RemainingIterations))
        continue;
    }

    // Take the first viable candidate, then keep the most profitable one.
    if (Result.Width.isScalar() ||
        isMoreProfitable(A: NextVF, B: Result, MaxTripCount,
                         HasTail: !CM.foldTailByMasking()))
      Result = NextVF;
  }

  if (Result != VectorizationFactor::Disabled())
    LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = "
                      << Result.Width << "\n");
  return Result;
}
4518
4519std::pair<unsigned, unsigned>
4520LoopVectorizationCostModel::getSmallestAndWidestTypes() {
4521 unsigned MinWidth = -1U;
4522 unsigned MaxWidth = 8;
4523 const DataLayout &DL = TheFunction->getDataLayout();
4524 // For in-loop reductions, no element types are added to ElementTypesInLoop
4525 // if there are no loads/stores in the loop. In this case, check through the
4526 // reduction variables to determine the maximum width.
4527 if (ElementTypesInLoop.empty() && !Legal->getReductionVars().empty()) {
4528 for (const auto &PhiDescriptorPair : Legal->getReductionVars()) {
4529 const RecurrenceDescriptor &RdxDesc = PhiDescriptorPair.second;
4530 // When finding the min width used by the recurrence we need to account
4531 // for casts on the input operands of the recurrence.
4532 MinWidth = std::min(
4533 a: MinWidth,
4534 b: std::min(a: RdxDesc.getMinWidthCastToRecurrenceTypeInBits(),
4535 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits()));
4536 MaxWidth = std::max(a: MaxWidth,
4537 b: RdxDesc.getRecurrenceType()->getScalarSizeInBits());
4538 }
4539 } else {
4540 for (Type *T : ElementTypesInLoop) {
4541 MinWidth = std::min<unsigned>(
4542 a: MinWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4543 MaxWidth = std::max<unsigned>(
4544 a: MaxWidth, b: DL.getTypeSizeInBits(Ty: T->getScalarType()).getFixedValue());
4545 }
4546 }
4547 return {MinWidth, MaxWidth};
4548}
4549
4550void LoopVectorizationCostModel::collectElementTypesForWidening() {
4551 ElementTypesInLoop.clear();
4552 // For each block.
4553 for (BasicBlock *BB : TheLoop->blocks()) {
4554 // For each instruction in the loop.
4555 for (Instruction &I : BB->instructionsWithoutDebug()) {
4556 Type *T = I.getType();
4557
4558 // Skip ignored values.
4559 if (ValuesToIgnore.count(Ptr: &I))
4560 continue;
4561
4562 // Only examine Loads, Stores and PHINodes.
4563 if (!isa<LoadInst>(Val: I) && !isa<StoreInst>(Val: I) && !isa<PHINode>(Val: I))
4564 continue;
4565
4566 // Examine PHI nodes that are reduction variables. Update the type to
4567 // account for the recurrence type.
4568 if (auto *PN = dyn_cast<PHINode>(Val: &I)) {
4569 if (!Legal->isReductionVariable(PN))
4570 continue;
4571 const RecurrenceDescriptor &RdxDesc =
4572 Legal->getRecurrenceDescriptor(PN);
4573 if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
4574 TTI.preferInLoopReduction(Kind: RdxDesc.getRecurrenceKind(),
4575 Ty: RdxDesc.getRecurrenceType()))
4576 continue;
4577 T = RdxDesc.getRecurrenceType();
4578 }
4579
4580 // Examine the stored values.
4581 if (auto *ST = dyn_cast<StoreInst>(Val: &I))
4582 T = ST->getValueOperand()->getType();
4583
4584 assert(T->isSized() &&
4585 "Expected the load/store/recurrence type to be sized");
4586
4587 ElementTypesInLoop.insert(Ptr: T);
4588 }
4589 }
4590}
4591
/// Selects the interleave count for vectorization factor \p VF of \p Plan,
/// given \p LoopCost, the cost of one loop iteration (0 means it still needs
/// to be computed here). Returns 1 whenever interleaving does not apply or is
/// not expected to pay off.
unsigned
LoopVectorizationCostModel::selectInterleaveCount(VPlan &Plan, ElementCount VF,
                                                  InstructionCost LoopCost) {
  // -- The interleave heuristics --
  // We interleave the loop in order to expose ILP and reduce the loop overhead.
  // There are many micro-architectural considerations that we can't predict
  // at this level. For example, frontend pressure (on decode or fetch) due to
  // code size, or the number and capabilities of the execution ports.
  //
  // We use the following heuristics to select the interleave count:
  // 1. If the code has reductions, then we interleave to break the cross
  // iteration dependency.
  // 2. If the loop is really small, then we interleave to reduce the loop
  // overhead.
  // 3. We don't interleave if we think that we will spill registers to memory
  // due to the increased register pressure.

  // Only interleave when a scalar epilogue is allowed.
  if (!isScalarEpilogueAllowed())
    return 1;

  // Do not interleave if EVL is preferred and no User IC is specified.
  if (foldTailWithEVL()) {
    LLVM_DEBUG(dbgs() << "LV: Preference for VP intrinsics indicated. "
                         "Unroll factor forced to be 1.\n");
    return 1;
  }

  // We used the distance for the interleave count.
  if (!Legal->isSafeForAnyVectorWidth())
    return 1;

  // We don't attempt to perform interleaving for loops with uncountable early
  // exits because the VPInstruction::AnyOf code cannot currently handle
  // multiple parts.
  if (Legal->hasUncountableEarlyExit())
    return 1;

  const bool HasReductions = !Legal->getReductionVars().empty();

  // If we did not calculate the cost for VF (because the user selected the VF)
  // then we calculate the cost of VF here.
  if (LoopCost == 0) {
    LoopCost = expectedCost(VF);
    assert(LoopCost.isValid() &&
           "Expected to have chosen a VF with valid cost");

    // Loop body is free and there is no need for interleaving.
    if (LoopCost == 0)
      return 1;
  }

  // Estimate the register usage of one vector iteration of the plan.
  VPRegisterUsage R =
      calculateRegisterUsageForPlan(Plan, VFs: {VF}, TTI, ValuesToIgnore)[0];
  // We divide by these constants so assume that we have at least one
  // instruction that uses at least one register.
  for (auto &Pair : R.MaxLocalUsers) {
    Pair.second = std::max(a: Pair.second, b: 1U);
  }

  // We calculate the interleave count using the following formula.
  // Subtract the number of loop invariants from the number of available
  // registers. These registers are used by all of the interleaved instances.
  // Next, divide the remaining registers by the number of registers that is
  // required by the loop, in order to estimate how many parallel instances
  // fit without causing spills. All of this is rounded down if necessary to be
  // a power of two. We want power of two interleave count to simplify any
  // addressing operations or alignment considerations.
  // We also want power of two interleave counts to ensure that the induction
  // variable of the vector loop wraps to zero, when tail is folded by masking;
  // this currently happens when OptForSize, in which case IC is set to 1 above.
  unsigned IC = UINT_MAX;

  // Take the minimum IC over all register classes used by the loop.
  for (const auto &Pair : R.MaxLocalUsers) {
    unsigned TargetNumRegisters = TTI.getNumberOfRegisters(ClassID: Pair.first);
    LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters
                      << " registers of "
                      << TTI.getRegisterClassName(Pair.first)
                      << " register class\n");
    if (VF.isScalar()) {
      if (ForceTargetNumScalarRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumScalarRegs;
    } else {
      if (ForceTargetNumVectorRegs.getNumOccurrences() > 0)
        TargetNumRegisters = ForceTargetNumVectorRegs;
    }
    unsigned MaxLocalUsers = Pair.second;
    unsigned LoopInvariantRegs = 0;
    if (R.LoopInvariantRegs.contains(Key: Pair.first))
      LoopInvariantRegs = R.LoopInvariantRegs[Pair.first];

    unsigned TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs) /
                                     MaxLocalUsers);
    // Don't count the induction variable as interleaved.
    if (EnableIndVarRegisterHeur) {
      TmpIC = llvm::bit_floor(Value: (TargetNumRegisters - LoopInvariantRegs - 1) /
                              std::max(a: 1U, b: (MaxLocalUsers - 1)));
    }

    IC = std::min(a: IC, b: TmpIC);
  }

  // Clamp the interleave ranges to reasonable counts.
  unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF);

  // Check if the user has overridden the max.
  if (VF.isScalar()) {
    if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor;
  } else {
    if (ForceTargetMaxVectorInterleaveFactor.getNumOccurrences() > 0)
      MaxInterleaveCount = ForceTargetMaxVectorInterleaveFactor;
  }

  // Number of lanes the loop is expected to handle per vector iteration at
  // runtime (scales scalable VFs by the tuning vscale).
  unsigned EstimatedVF = getEstimatedRuntimeVF(VF, VScale: VScaleForTuning);

  // Try to get the exact trip count, or an estimate based on profiling data or
  // ConstantMax from PSE, failing that.
  if (auto BestKnownTC = getSmallBestKnownTC(PSE, L: TheLoop)) {
    // At least one iteration must be scalar when this constraint holds. So the
    // maximum available iterations for interleaving is one less.
    unsigned AvailableTC = requiresScalarEpilogue(IsVectorizing: VF.isVector())
                               ? BestKnownTC->getFixedValue() - 1
                               : BestKnownTC->getFixedValue();

    unsigned InterleaveCountLB = bit_floor(Value: std::max(
        a: 1u, b: std::min(a: AvailableTC / (EstimatedVF * 2), b: MaxInterleaveCount)));

    if (getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop).isNonZero()) {
      // If the best known trip count is exact, we select between two
      // prospective ICs, where
      //
      // 1) the aggressive IC is capped by the trip count divided by VF
      // 2) the conservative IC is capped by the trip count divided by (VF * 2)
      //
      // The final IC is selected in a way that the epilogue loop trip count is
      // minimized while maximizing the IC itself, so that we either run the
      // vector loop at least once if it generates a small epilogue loop, or
      // else we run the vector loop at least twice.

      unsigned InterleaveCountUB = bit_floor(Value: std::max(
          a: 1u, b: std::min(a: AvailableTC / EstimatedVF, b: MaxInterleaveCount)));
      MaxInterleaveCount = InterleaveCountLB;

      if (InterleaveCountUB != InterleaveCountLB) {
        unsigned TailTripCountUB =
            (AvailableTC % (EstimatedVF * InterleaveCountUB));
        unsigned TailTripCountLB =
            (AvailableTC % (EstimatedVF * InterleaveCountLB));
        // If both produce same scalar tail, maximize the IC to do the same work
        // in fewer vector loop iterations
        if (TailTripCountUB == TailTripCountLB)
          MaxInterleaveCount = InterleaveCountUB;
      }
    } else {
      // If trip count is an estimated compile time constant, limit the
      // IC to be capped by the trip count divided by VF * 2, such that the
      // vector loop runs at least twice to make interleaving seem profitable
      // when there is an epilogue loop present. Since exact Trip count is not
      // known we choose to be conservative in our IC estimate.
      MaxInterleaveCount = InterleaveCountLB;
    }
  }

  assert(MaxInterleaveCount > 0 &&
         "Maximum interleave count must be greater than 0");

  // Clamp the calculated IC to be between the 1 and the max interleave count
  // that the target and trip count allows.
  if (IC > MaxInterleaveCount)
    IC = MaxInterleaveCount;
  else
    // Make sure IC is greater than 0.
    IC = std::max(a: 1u, b: IC);

  assert(IC > 0 && "Interleave count must be greater than 0.");

  // Interleave if we vectorized this loop and there is a reduction that could
  // benefit from interleaving.
  if (VF.isVector() && HasReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
    return IC;
  }

  // For any scalar loop that either requires runtime checks or predication we
  // are better off leaving this to the unroller. Note that if we've already
  // vectorized the loop we will have done the runtime check and so interleaving
  // won't require further checks.
  bool ScalarInterleavingRequiresPredication =
      (VF.isScalar() && any_of(Range: TheLoop->blocks(), P: [this](BasicBlock *BB) {
         return Legal->blockNeedsPredication(BB);
       }));
  bool ScalarInterleavingRequiresRuntimePointerCheck =
      (VF.isScalar() && Legal->getRuntimePointerChecking()->Need);

  // We want to interleave small loops in order to reduce the loop overhead and
  // potentially expose ILP opportunities.
  LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n'
                    << "LV: IC is " << IC << '\n'
                    << "LV: VF is " << VF << '\n');
  const bool AggressivelyInterleaveReductions =
      TTI.enableAggressiveInterleaving(LoopHasReductions: HasReductions);
  if (!ScalarInterleavingRequiresRuntimePointerCheck &&
      !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) {
    // We assume that the cost overhead is 1 and we use the cost model
    // to estimate the cost of the loop and interleave until the cost of the
    // loop overhead is about 5% of the cost of the loop.
    unsigned SmallIC = std::min(a: IC, b: (unsigned)llvm::bit_floor<uint64_t>(
                                          Value: SmallLoopCost / LoopCost.getValue()));

    // Interleave until store/load ports (estimated by max interleave count) are
    // saturated.
    unsigned NumStores = Legal->getNumStores();
    unsigned NumLoads = Legal->getNumLoads();
    unsigned StoresIC = IC / (NumStores ? NumStores : 1);
    unsigned LoadsIC = IC / (NumLoads ? NumLoads : 1);

    // There is little point in interleaving for reductions containing selects
    // and compares when VF=1 since it may just create more overhead than it's
    // worth for loops with small trip counts. This is because we still have to
    // do the final reduction after the loop.
    bool HasSelectCmpReductions =
        HasReductions &&
        any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
          const RecurrenceDescriptor &RdxDesc = Reduction.second;
          RecurKind RK = RdxDesc.getRecurrenceKind();
          return RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) ||
                 RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK);
        });
    if (HasSelectCmpReductions) {
      LLVM_DEBUG(dbgs() << "LV: Not interleaving select-cmp reductions.\n");
      return 1;
    }

    // If we have a scalar reduction (vector reductions are already dealt with
    // by this point), we can increase the critical path length if the loop
    // we're interleaving is inside another loop. For tree-wise reductions
    // set the limit to 2, and for ordered reductions it's best to disable
    // interleaving entirely.
    if (HasReductions && TheLoop->getLoopDepth() > 1) {
      bool HasOrderedReductions =
          any_of(Range: Legal->getReductionVars(), P: [&](auto &Reduction) -> bool {
            const RecurrenceDescriptor &RdxDesc = Reduction.second;
            return RdxDesc.isOrdered();
          });
      if (HasOrderedReductions) {
        LLVM_DEBUG(
            dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
        return 1;
      }

      unsigned F = MaxNestedScalarReductionIC;
      SmallIC = std::min(a: SmallIC, b: F);
      StoresIC = std::min(a: StoresIC, b: F);
      LoadsIC = std::min(a: LoadsIC, b: F);
    }

    if (EnableLoadStoreRuntimeInterleave &&
        std::max(a: StoresIC, b: LoadsIC) > SmallIC) {
      LLVM_DEBUG(
          dbgs() << "LV: Interleaving to saturate store or load ports.\n");
      return std::max(a: StoresIC, b: LoadsIC);
    }

    // If there are scalar reductions and TTI has enabled aggressive
    // interleaving for reductions, we will interleave to expose ILP.
    if (VF.isScalar() && AggressivelyInterleaveReductions) {
      LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
      // Interleave no less than SmallIC but not as aggressive as the normal IC
      // to satisfy the rare situation when resources are too limited.
      return std::max(a: IC / 2, b: SmallIC);
    }

    LLVM_DEBUG(dbgs() << "LV: Interleaving to reduce branch cost.\n");
    return SmallIC;
  }

  // Interleave if this is a large loop (small loops are already dealt with by
  // this point) that could benefit from interleaving.
  if (AggressivelyInterleaveReductions) {
    LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
    return IC;
  }

  LLVM_DEBUG(dbgs() << "LV: Not Interleaving.\n");
  return 1;
}
4877
4878bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I,
4879 ElementCount VF) {
4880 // TODO: Cost model for emulated masked load/store is completely
4881 // broken. This hack guides the cost model to use an artificially
4882 // high enough value to practically disable vectorization with such
4883 // operations, except where previously deployed legality hack allowed
4884 // using very low cost values. This is to avoid regressions coming simply
4885 // from moving "masked load/store" check from legality to cost model.
4886 // Masked Load/Gather emulation was previously never allowed.
4887 // Limited number of Masked Store/Scatter emulation was allowed.
4888 assert((isPredicatedInst(I)) &&
4889 "Expecting a scalar emulated instruction");
4890 return isa<LoadInst>(Val: I) ||
4891 (isa<StoreInst>(Val: I) &&
4892 NumPredStores > NumberOfStoresToPredicate);
4893}
4894
// Collects, for vectorization factor VF, the instructions that are more
// profitable to keep scalar (and predicated) than to if-convert and widen.
// Results are memoized in InstsToScalarize[VF]; the blocks that will remain
// predicated are recorded in PredicatedBBsAfterVectorization[VF].
void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) {
  assert(VF.isVector() && "Expected VF >= 2");

  // If we've already collected the instructions to scalarize or the predicated
  // BBs after vectorization, there's nothing to do. Collection may already have
  // occurred if we have a user-selected VF and are now computing the expected
  // cost for interleaving.
  if (InstsToScalarize.contains(Val: VF) ||
      PredicatedBBsAfterVectorization.contains(Val: VF))
    return;

  // Initialize a mapping for VF in InstsToScalalarize. If we find that it's
  // not profitable to scalarize any instructions, the presence of VF in the
  // map will indicate that we've analyzed it already.
  ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF];

  // Find all the instructions that are scalar with predication in the loop and
  // determine if it would be better to not if-convert the blocks they are in.
  // If so, we also record the instructions to scalarize.
  for (BasicBlock *BB : TheLoop->blocks()) {
    if (!blockNeedsPredicationForAnyReason(BB))
      continue;
    for (Instruction &I : *BB)
      if (isScalarWithPredication(I: &I, VF)) {
        ScalarCostsTy ScalarCosts;
        // Do not apply discount logic for:
        // 1. Scalars after vectorization, as there will only be a single copy
        // of the instruction.
        // 2. Scalable VF, as that would lead to invalid scalarization costs.
        // 3. Emulated masked memrefs, if a hacked cost is needed.
        if (!isScalarAfterVectorization(I: &I, VF) && !VF.isScalable() &&
            !useEmulatedMaskMemRefHack(I: &I, VF) &&
            computePredInstDiscount(PredInst: &I, ScalarCosts, VF) >= 0) {
          // A non-negative discount means scalarizing the chain is at least
          // as cheap as vectorizing it; commit the per-instruction costs.
          ScalarCostsVF.insert_range(R&: ScalarCosts);
          // Check if we decided to scalarize a call. If so, update the widening
          // decision of the call to CM_Scalarize with the computed scalar cost.
          for (const auto &[I, Cost] : ScalarCosts) {
            auto *CI = dyn_cast<CallInst>(Val: I);
            if (!CI || !CallWideningDecisions.contains(Val: {CI, VF}))
              continue;
            CallWideningDecisions[{CI, VF}].Kind = CM_Scalarize;
            CallWideningDecisions[{CI, VF}].Cost = Cost;
          }
        }
        // Remember that BB will remain after vectorization.
        PredicatedBBsAfterVectorization[VF].insert(Ptr: BB);
        // A predecessor whose sole successor is BB is also treated as
        // remaining after vectorization.
        for (auto *Pred : predecessors(BB)) {
          if (Pred->getSingleSuccessor() == BB)
            PredicatedBBsAfterVectorization[VF].insert(Ptr: Pred);
        }
      }
  }
}
4948
// Computes the cost discount obtained by scalarizing (rather than
// vectorizing) the single-use chain of instructions feeding PredInst, for
// factor VF. A non-negative result means the scalar form is at least as
// cheap; the per-instruction scalar costs are recorded in ScalarCosts.
InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
    Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) {
  assert(!isUniformAfterVectorization(PredInst, VF) &&
         "Instruction marked uniform-after-vectorization will be predicated");

  // Initialize the discount to zero, meaning that the scalar version and the
  // vector version cost the same.
  InstructionCost Discount = 0;

  // Holds instructions to analyze. The instructions we visit are mapped in
  // ScalarCosts. Those instructions are the ones that would be scalarized if
  // we find that the scalar version costs less.
  SmallVector<Instruction *, 8> Worklist;

  // Returns true if the given instruction can be scalarized.
  auto CanBeScalarized = [&](Instruction *I) -> bool {
    // We only attempt to scalarize instructions forming a single-use chain
    // from the original predicated block that would otherwise be vectorized.
    // Although not strictly necessary, we give up on instructions we know will
    // already be scalar to avoid traversing chains that are unlikely to be
    // beneficial.
    if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
        isScalarAfterVectorization(I, VF))
      return false;

    // If the instruction is scalar with predication, it will be analyzed
    // separately. We ignore it within the context of PredInst.
    if (isScalarWithPredication(I, VF))
      return false;

    // If any of the instruction's operands are uniform after vectorization,
    // the instruction cannot be scalarized. This prevents, for example, a
    // masked load from being scalarized.
    //
    // We assume we will only emit a value for lane zero of an instruction
    // marked uniform after vectorization, rather than VF identical values.
    // Thus, if we scalarize an instruction that uses a uniform, we would
    // create uses of values corresponding to the lanes we aren't emitting code
    // for. This behavior can be changed by allowing getScalarValue to clone
    // the lane zero values for uniforms rather than asserting.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get()))
        if (isUniformAfterVectorization(I: J, VF))
          return false;

    // Otherwise, we can scalarize the instruction.
    return true;
  };

  // Compute the expected cost discount from scalarizing the entire expression
  // feeding the predicated instruction. We currently only consider expressions
  // that are single-use instruction chains.
  Worklist.push_back(Elt: PredInst);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();

    // If we've already analyzed the instruction, there's nothing to do.
    if (ScalarCosts.contains(Val: I))
      continue;

    // Cannot scalarize fixed-order recurrence phis at the moment.
    if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
      continue;

    // Compute the cost of the vector instruction. Note that this cost already
    // includes the scalarization overhead of the predicated instruction.
    InstructionCost VectorCost = getInstructionCost(I, VF);

    // Compute the cost of the scalarized instruction. This cost is the cost of
    // the instruction as if it wasn't if-converted and instead remained in the
    // predicated block. We will scale this cost by block probability after
    // computing the scalarization overhead.
    InstructionCost ScalarCost =
        VF.getFixedValue() *
        getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1));

    // Compute the scalarization overhead of needed insertelement instructions
    // and phi nodes.
    if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) {
      Type *WideTy = toVectorizedTy(Ty: I->getType(), EC: VF);
      for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
        ScalarCost += TTI.getScalarizationOverhead(
            Ty: cast<VectorType>(Val: VectorTy),
            DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
            /*Insert=*/true,
            /*Extract=*/false, CostKind);
      }
      // One phi per scalarized lane merges the predicated results back in.
      ScalarCost +=
          VF.getFixedValue() * TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
    }

    // Compute the scalarization overhead of needed extractelement
    // instructions. For each of the instruction's operands, if the operand can
    // be scalarized, add it to the worklist; otherwise, account for the
    // overhead.
    for (Use &U : I->operands())
      if (auto *J = dyn_cast<Instruction>(Val: U.get())) {
        assert(canVectorizeTy(J->getType()) &&
               "Instruction has non-scalar type");
        if (CanBeScalarized(J))
          Worklist.push_back(Elt: J);
        else if (needsExtract(V: J, VF)) {
          Type *WideTy = toVectorizedTy(Ty: J->getType(), EC: VF);
          for (Type *VectorTy : getContainedTypes(Ty: WideTy)) {
            ScalarCost += TTI.getScalarizationOverhead(
                Ty: cast<VectorType>(Val: VectorTy),
                DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
                /*Insert*/ false,
                /*Extract*/ true, CostKind);
          }
        }
      }

    // Scale the total scalar cost by block probability.
    ScalarCost /= getPredBlockCostDivisor(CostKind);

    // Compute the discount. A non-negative discount means the vector version
    // of the instruction costs more, and scalarizing would be beneficial.
    Discount += VectorCost - ScalarCost;
    ScalarCosts[I] = ScalarCost;
  }

  return Discount;
}
5070
// Returns the expected cost of one iteration of the loop when vectorized
// with factor VF, summing per-instruction costs over all blocks and, for the
// scalar case, scaling predicated blocks by their execution probability.
InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
  InstructionCost Cost;

  // If the vector loop gets executed exactly once with the given VF, ignore the
  // costs of comparison and induction instructions, as they'll get simplified
  // away.
  SmallPtrSet<Instruction *, 2> ValuesToIgnoreForVF;
  auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: TheLoop);
  if (TC == VF && !foldTailByMasking())
    addFullyUnrolledInstructionsToIgnore(L: TheLoop, IL: Legal->getInductionVars(),
                                         InstsToIgnore&: ValuesToIgnoreForVF);

  // For each block.
  for (BasicBlock *BB : TheLoop->blocks()) {
    InstructionCost BlockCost;

    // For each instruction in the old loop.
    for (Instruction &I : BB->instructionsWithoutDebug()) {
      // Skip ignored values.
      if (ValuesToIgnore.count(Ptr: &I) || ValuesToIgnoreForVF.count(Ptr: &I) ||
          (VF.isVector() && VecValuesToIgnore.count(Ptr: &I)))
        continue;

      InstructionCost C = getInstructionCost(I: &I, VF);

      // Check if we should override the cost.
      if (C.isValid() && ForceTargetInstructionCost.getNumOccurrences() > 0)
        C = InstructionCost(ForceTargetInstructionCost);

      BlockCost += C;
      LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C << " for VF "
                        << VF << " For instruction: " << I << '\n');
    }

    // If we are vectorizing a predicated block, it will have been
    // if-converted. This means that the block's instructions (aside from
    // stores and instructions that may divide by zero) will now be
    // unconditionally executed. For the scalar case, we may not always execute
    // the predicated block, if it is an if-else block. Thus, scale the block's
    // cost by the probability of executing it. blockNeedsPredication from
    // Legal is used so as to not include all blocks in tail folded loops.
    if (VF.isScalar() && Legal->blockNeedsPredication(BB))
      BlockCost /= getPredBlockCostDivisor(CostKind);

    Cost += BlockCost;
  }

  return Cost;
}
5120
5121/// Gets Address Access SCEV after verifying that the access pattern
5122/// is loop invariant except the induction variable dependence.
5123///
5124/// This SCEV can be sent to the Target in order to estimate the address
5125/// calculation cost.
5126static const SCEV *getAddressAccessSCEV(
5127 Value *Ptr,
5128 LoopVectorizationLegality *Legal,
5129 PredicatedScalarEvolution &PSE,
5130 const Loop *TheLoop) {
5131
5132 auto *Gep = dyn_cast<GetElementPtrInst>(Val: Ptr);
5133 if (!Gep)
5134 return nullptr;
5135
5136 // We are looking for a gep with all loop invariant indices except for one
5137 // which should be an induction variable.
5138 auto *SE = PSE.getSE();
5139 unsigned NumOperands = Gep->getNumOperands();
5140 for (unsigned Idx = 1; Idx < NumOperands; ++Idx) {
5141 Value *Opd = Gep->getOperand(i_nocapture: Idx);
5142 if (!SE->isLoopInvariant(S: SE->getSCEV(V: Opd), L: TheLoop) &&
5143 !Legal->isInductionVariable(V: Opd))
5144 return nullptr;
5145 }
5146
5147 // Now we know we have a GEP ptr, %inv, %ind, %inv. return the Ptr SCEV.
5148 return PSE.getSCEV(V: Ptr);
5149}
5150
5151InstructionCost
5152LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
5153 ElementCount VF) {
5154 assert(VF.isVector() &&
5155 "Scalarization cost of instruction implies vectorization.");
5156 if (VF.isScalable())
5157 return InstructionCost::getInvalid();
5158
5159 Type *ValTy = getLoadStoreType(I);
5160 auto *SE = PSE.getSE();
5161
5162 unsigned AS = getLoadStoreAddressSpace(I);
5163 Value *Ptr = getLoadStorePointerOperand(V: I);
5164 Type *PtrTy = toVectorTy(Scalar: Ptr->getType(), EC: VF);
5165 // NOTE: PtrTy is a vector to signal `TTI::getAddressComputationCost`
5166 // that it is being called from this specific place.
5167
5168 // Figure out whether the access is strided and get the stride value
5169 // if it's known in compile time
5170 const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, PSE, TheLoop);
5171
5172 // Get the cost of the scalar memory instruction and address computation.
5173 InstructionCost Cost =
5174 VF.getFixedValue() * TTI.getAddressComputationCost(Ty: PtrTy, SE, Ptr: PtrSCEV);
5175
5176 // Don't pass *I here, since it is scalar but will actually be part of a
5177 // vectorized loop where the user of it is a vectorized instruction.
5178 const Align Alignment = getLoadStoreAlignment(I);
5179 Cost += VF.getFixedValue() * TTI.getMemoryOpCost(Opcode: I->getOpcode(),
5180 Src: ValTy->getScalarType(),
5181 Alignment, AddressSpace: AS, CostKind);
5182
5183 // Get the overhead of the extractelement and insertelement instructions
5184 // we might create due to scalarization.
5185 Cost += getScalarizationOverhead(I, VF);
5186
5187 // If we have a predicated load/store, it will need extra i1 extracts and
5188 // conditional branches, but may not be executed for each vector lane. Scale
5189 // the cost by the probability of executing the predicated block.
5190 if (isPredicatedInst(I)) {
5191 Cost /= getPredBlockCostDivisor(CostKind);
5192
5193 // Add the cost of an i1 extract and a branch
5194 auto *VecI1Ty =
5195 VectorType::get(ElementType: IntegerType::getInt1Ty(C&: ValTy->getContext()), EC: VF);
5196 Cost += TTI.getScalarizationOverhead(
5197 Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5198 /*Insert=*/false, /*Extract=*/true, CostKind);
5199 Cost += TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);
5200
5201 if (useEmulatedMaskMemRefHack(I, VF))
5202 // Artificially setting to a high enough value to practically disable
5203 // vectorization with such operations.
5204 Cost = 3000000;
5205 }
5206
5207 return Cost;
5208}
5209
5210InstructionCost
5211LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
5212 ElementCount VF) {
5213 Type *ValTy = getLoadStoreType(I);
5214 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5215 Value *Ptr = getLoadStorePointerOperand(V: I);
5216 unsigned AS = getLoadStoreAddressSpace(I);
5217 int ConsecutiveStride = Legal->isConsecutivePtr(AccessTy: ValTy, Ptr);
5218
5219 assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
5220 "Stride should be 1 or -1 for consecutive memory access");
5221 const Align Alignment = getLoadStoreAlignment(I);
5222 InstructionCost Cost = 0;
5223 if (Legal->isMaskRequired(I)) {
5224 Cost += TTI.getMaskedMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5225 CostKind);
5226 } else {
5227 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5228 Cost += TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: VectorTy, Alignment, AddressSpace: AS,
5229 CostKind, OpdInfo: OpInfo, I);
5230 }
5231
5232 bool Reverse = ConsecutiveStride < 0;
5233 if (Reverse)
5234 Cost += TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5235 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5236 return Cost;
5237}
5238
5239InstructionCost
5240LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
5241 ElementCount VF) {
5242 assert(Legal->isUniformMemOp(*I, VF));
5243
5244 Type *ValTy = getLoadStoreType(I);
5245 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5246 const Align Alignment = getLoadStoreAlignment(I);
5247 unsigned AS = getLoadStoreAddressSpace(I);
5248 if (isa<LoadInst>(Val: I)) {
5249 return TTI.getAddressComputationCost(Ty: ValTy) +
5250 TTI.getMemoryOpCost(Opcode: Instruction::Load, Src: ValTy, Alignment, AddressSpace: AS,
5251 CostKind) +
5252 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Broadcast, DstTy: VectorTy,
5253 SrcTy: VectorTy, Mask: {}, CostKind);
5254 }
5255 StoreInst *SI = cast<StoreInst>(Val: I);
5256
5257 bool IsLoopInvariantStoreValue = Legal->isInvariant(V: SI->getValueOperand());
5258 // TODO: We have existing tests that request the cost of extracting element
5259 // VF.getKnownMinValue() - 1 from a scalable vector. This does not represent
5260 // the actual generated code, which involves extracting the last element of
5261 // a scalable vector where the lane to extract is unknown at compile time.
5262 return TTI.getAddressComputationCost(Ty: ValTy) +
5263 TTI.getMemoryOpCost(Opcode: Instruction::Store, Src: ValTy, Alignment, AddressSpace: AS,
5264 CostKind) +
5265 (IsLoopInvariantStoreValue
5266 ? 0
5267 : TTI.getVectorInstrCost(Opcode: Instruction::ExtractElement, Val: VectorTy,
5268 CostKind, Index: VF.getKnownMinValue() - 1));
5269}
5270
5271InstructionCost
5272LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
5273 ElementCount VF) {
5274 Type *ValTy = getLoadStoreType(I);
5275 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5276 const Align Alignment = getLoadStoreAlignment(I);
5277 const Value *Ptr = getLoadStorePointerOperand(V: I);
5278
5279 return TTI.getAddressComputationCost(Ty: VectorTy) +
5280 TTI.getGatherScatterOpCost(Opcode: I->getOpcode(), DataTy: VectorTy, Ptr,
5281 VariableMask: Legal->isMaskRequired(I), Alignment,
5282 CostKind, I);
5283}
5284
5285InstructionCost
5286LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
5287 ElementCount VF) {
5288 const auto *Group = getInterleavedAccessGroup(Instr: I);
5289 assert(Group && "Fail to get an interleaved access group.");
5290
5291 Instruction *InsertPos = Group->getInsertPos();
5292 Type *ValTy = getLoadStoreType(I: InsertPos);
5293 auto *VectorTy = cast<VectorType>(Val: toVectorTy(Scalar: ValTy, EC: VF));
5294 unsigned AS = getLoadStoreAddressSpace(I: InsertPos);
5295
5296 unsigned InterleaveFactor = Group->getFactor();
5297 auto *WideVecTy = VectorType::get(ElementType: ValTy, EC: VF * InterleaveFactor);
5298
5299 // Holds the indices of existing members in the interleaved group.
5300 SmallVector<unsigned, 4> Indices;
5301 for (unsigned IF = 0; IF < InterleaveFactor; IF++)
5302 if (Group->getMember(Index: IF))
5303 Indices.push_back(Elt: IF);
5304
5305 // Calculate the cost of the whole interleaved group.
5306 bool UseMaskForGaps =
5307 (Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed()) ||
5308 (isa<StoreInst>(Val: I) && (Group->getNumMembers() < Group->getFactor()));
5309 InstructionCost Cost = TTI.getInterleavedMemoryOpCost(
5310 Opcode: InsertPos->getOpcode(), VecTy: WideVecTy, Factor: Group->getFactor(), Indices,
5311 Alignment: Group->getAlign(), AddressSpace: AS, CostKind, UseMaskForCond: Legal->isMaskRequired(I),
5312 UseMaskForGaps);
5313
5314 if (Group->isReverse()) {
5315 // TODO: Add support for reversed masked interleaved access.
5316 assert(!Legal->isMaskRequired(I) &&
5317 "Reverse masked interleaved access not supported.");
5318 Cost += Group->getNumMembers() *
5319 TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Reverse, DstTy: VectorTy,
5320 SrcTy: VectorTy, Mask: {}, CostKind, Index: 0);
5321 }
5322 return Cost;
5323}
5324
/// Try to cost an entire in-loop reduction pattern (e.g.
/// reduce.add(mul(ext(A), ext(B)))) as one target operation. Returns the
/// combined cost for the pattern's root instruction and 0 for the other
/// pattern members, or std::nullopt when the normal per-instruction costing
/// should be used instead.
std::optional<InstructionCost>
LoopVectorizationCostModel::getReductionPatternCost(Instruction *I,
                                                    ElementCount VF,
                                                    Type *Ty) const {
  using namespace llvm::PatternMatch;
  // Early exit for no inloop reductions
  if (InLoopReductions.empty() || VF.isScalar() || !isa<VectorType>(Val: Ty))
    return std::nullopt;
  auto *VectorTy = cast<VectorType>(Val: Ty);

  // We are looking for a pattern of, and finding the minimal acceptable cost:
  //  reduce(mul(ext(A), ext(B))) or
  //  reduce(mul(A, B)) or
  //  reduce(ext(A)) or
  //  reduce(A).
  // The basic idea is that we walk down the tree to do that, finding the root
  // reduction instruction in InLoopReductionImmediateChains. From there we find
  // the pattern of mul/ext and test the cost of the entire pattern vs the cost
  // of the components. If the reduction cost is lower then we return it for the
  // reduction instruction and 0 for the other instructions in the pattern. If
  // it is not we return an invalid cost specifying the orignal cost method
  // should be used.
  Instruction *RetI = I;
  // If I is an extend with a single user, the candidate root is that user.
  if (match(V: RetI, P: m_ZExtOrSExt(Op: m_Value()))) {
    if (!RetI->hasOneUser())
      return std::nullopt;
    RetI = RetI->user_back();
  }

  // Likewise step over a single-use mul into the add that consumes it.
  if (match(V: RetI, P: m_OneUse(SubPattern: m_Mul(L: m_Value(), R: m_Value()))) &&
      RetI->user_back()->getOpcode() == Instruction::Add) {
    RetI = RetI->user_back();
  }

  // Test if the found instruction is a reduction, and if not return an invalid
  // cost specifying the parent to use the original cost modelling.
  Instruction *LastChain = InLoopReductionImmediateChains.lookup(Val: RetI);
  if (!LastChain)
    return std::nullopt;

  // Find the reduction this chain is a part of and calculate the basic cost of
  // the reduction on its own.
  Instruction *ReductionPhi = LastChain;
  while (!isa<PHINode>(Val: ReductionPhi))
    ReductionPhi = InLoopReductionImmediateChains.at(Val: ReductionPhi);

  const RecurrenceDescriptor &RdxDesc =
      Legal->getRecurrenceDescriptor(PN: cast<PHINode>(Val: ReductionPhi));

  // BaseCost is the plain reduce(A) cost against which patterns compete.
  InstructionCost BaseCost;
  RecurKind RK = RdxDesc.getRecurrenceKind();
  if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK)) {
    Intrinsic::ID MinMaxID = getMinMaxReductionIntrinsicOp(RK);
    BaseCost = TTI.getMinMaxReductionCost(IID: MinMaxID, Ty: VectorTy,
                                          FMF: RdxDesc.getFastMathFlags(), CostKind);
  } else {
    BaseCost = TTI.getArithmeticReductionCost(
        Opcode: RdxDesc.getOpcode(), Ty: VectorTy, FMF: RdxDesc.getFastMathFlags(), CostKind);
  }

  // For a call to the llvm.fmuladd intrinsic we need to add the cost of a
  // normal fmul instruction to the cost of the fadd reduction.
  if (RK == RecurKind::FMulAdd)
    BaseCost +=
        TTI.getArithmeticInstrCost(Opcode: Instruction::FMul, Ty: VectorTy, CostKind);

  // If we're using ordered reductions then we can just return the base cost
  // here, since getArithmeticReductionCost calculates the full ordered
  // reduction cost when FP reassociation is not allowed.
  if (useOrderedReductions(RdxDesc))
    return BaseCost;

  // Get the operand that was not the reduction chain and match it to one of the
  // patterns, returning the better cost if it is found.
  Instruction *RedOp = RetI->getOperand(i: 1) == LastChain
                           ? dyn_cast<Instruction>(Val: RetI->getOperand(i: 0))
                           : dyn_cast<Instruction>(Val: RetI->getOperand(i: 1));

  VectorTy = VectorType::get(ElementType: I->getOperand(i: 0)->getType(), Other: VectorTy);

  Instruction *Op0, *Op1;
  if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
      match(V: RedOp,
            P: m_ZExtOrSExt(Op: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) &&
      match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
      Op0->getOpcode() == Op1->getOpcode() &&
      Op0->getOperand(i: 0)->getType() == Op1->getOperand(i: 0)->getType() &&
      !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1) &&
      (Op0->getOpcode() == RedOp->getOpcode() || Op0 == Op1)) {

    // Matched reduce.add(ext(mul(ext(A), ext(B)))
    // Note that the extend opcodes need to all match, or if A==B they will have
    // been converted to zext(mul(sext(A), sext(A))) as it is known positive,
    // which is equally fine.
    bool IsUnsigned = isa<ZExtInst>(Val: Op0);
    auto *ExtType = VectorType::get(ElementType: Op0->getOperand(i: 0)->getType(), Other: VectorTy);
    auto *MulType = VectorType::get(ElementType: Op0->getType(), Other: VectorTy);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: Op0->getOpcode(), Dst: MulType, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: Op0);
    InstructionCost MulCost =
        TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: MulType, CostKind);
    InstructionCost Ext2Cost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: MulType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);

    InstructionCost RedCost = TTI.getMulAccReductionCost(
        IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);

    // Prefer the fused cost only when it beats the sum of the components.
    if (RedCost.isValid() &&
        RedCost < ExtCost * 2 + MulCost + Ext2Cost + BaseCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && match(V: RedOp, P: m_ZExtOrSExt(Op: m_Value())) &&
             !TheLoop->isLoopInvariant(V: RedOp)) {
    // Matched reduce(ext(A))
    bool IsUnsigned = isa<ZExtInst>(Val: RedOp);
    auto *ExtType = VectorType::get(ElementType: RedOp->getOperand(i: 0)->getType(), Other: VectorTy);
    InstructionCost RedCost = TTI.getExtendedReductionCost(
        Opcode: RdxDesc.getOpcode(), IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType,
        FMF: RdxDesc.getFastMathFlags(), CostKind);

    InstructionCost ExtCost =
        TTI.getCastInstrCost(Opcode: RedOp->getOpcode(), Dst: VectorTy, Src: ExtType,
                             CCH: TTI::CastContextHint::None, CostKind, I: RedOp);
    if (RedCost.isValid() && RedCost < BaseCost + ExtCost)
      return I == RetI ? RedCost : 0;
  } else if (RedOp && RdxDesc.getOpcode() == Instruction::Add &&
             match(V: RedOp, P: m_Mul(L: m_Instruction(I&: Op0), R: m_Instruction(I&: Op1)))) {
    if (match(V: Op0, P: m_ZExtOrSExt(Op: m_Value())) &&
        Op0->getOpcode() == Op1->getOpcode() &&
        !TheLoop->isLoopInvariant(V: Op0) && !TheLoop->isLoopInvariant(V: Op1)) {
      bool IsUnsigned = isa<ZExtInst>(Val: Op0);
      Type *Op0Ty = Op0->getOperand(i: 0)->getType();
      Type *Op1Ty = Op1->getOperand(i: 0)->getType();
      Type *LargestOpTy =
          Op0Ty->getIntegerBitWidth() < Op1Ty->getIntegerBitWidth() ? Op1Ty
                                                                    : Op0Ty;
      auto *ExtType = VectorType::get(ElementType: LargestOpTy, Other: VectorTy);

      // Matched reduce.add(mul(ext(A), ext(B))), where the two ext may be of
      // different sizes. We take the largest type as the ext to reduce, and add
      // the remaining cost as, for example reduce(mul(ext(ext(A)), ext(B))).
      InstructionCost ExtCost0 = TTI.getCastInstrCost(
          Opcode: Op0->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op0Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op0);
      InstructionCost ExtCost1 = TTI.getCastInstrCost(
          Opcode: Op1->getOpcode(), Dst: VectorTy, Src: VectorType::get(ElementType: Op1Ty, Other: VectorTy),
          CCH: TTI::CastContextHint::None, CostKind, I: Op1);
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned, ResTy: RdxDesc.getRecurrenceType(), Ty: ExtType, CostKind);
      InstructionCost ExtraExtCost = 0;
      if (Op0Ty != LargestOpTy || Op1Ty != LargestOpTy) {
        Instruction *ExtraExtOp = (Op0Ty != LargestOpTy) ? Op0 : Op1;
        ExtraExtCost = TTI.getCastInstrCost(
            Opcode: ExtraExtOp->getOpcode(), Dst: ExtType,
            Src: VectorType::get(ElementType: ExtraExtOp->getOperand(i: 0)->getType(), Other: VectorTy),
            CCH: TTI::CastContextHint::None, CostKind, I: ExtraExtOp);
      }

      if (RedCost.isValid() &&
          (RedCost + ExtraExtCost) < (ExtCost0 + ExtCost1 + MulCost + BaseCost))
        return I == RetI ? RedCost : 0;
    } else if (!match(V: I, P: m_ZExtOrSExt(Op: m_Value()))) {
      // Matched reduce.add(mul())
      InstructionCost MulCost =
          TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      InstructionCost RedCost = TTI.getMulAccReductionCost(
          IsUnsigned: true, ResTy: RdxDesc.getRecurrenceType(), Ty: VectorTy, CostKind);

      if (RedCost.isValid() && RedCost < MulCost + BaseCost)
        return I == RetI ? RedCost : 0;
    }
  }

  // No pattern beat the components: the root gets the base reduction cost,
  // other members fall back to the normal costing.
  return I == RetI ? std::optional<InstructionCost>(BaseCost) : std::nullopt;
}
5506
5507InstructionCost
5508LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
5509 ElementCount VF) {
5510 // Calculate scalar cost only. Vectorization cost should be ready at this
5511 // moment.
5512 if (VF.isScalar()) {
5513 Type *ValTy = getLoadStoreType(I);
5514 const Align Alignment = getLoadStoreAlignment(I);
5515 unsigned AS = getLoadStoreAddressSpace(I);
5516
5517 TTI::OperandValueInfo OpInfo = TTI::getOperandInfo(V: I->getOperand(i: 0));
5518 return TTI.getAddressComputationCost(Ty: ValTy) +
5519 TTI.getMemoryOpCost(Opcode: I->getOpcode(), Src: ValTy, Alignment, AddressSpace: AS, CostKind,
5520 OpdInfo: OpInfo, I);
5521 }
5522 return getWideningCost(I, VF);
5523}
5524
5525InstructionCost
5526LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
5527 ElementCount VF) const {
5528
5529 // There is no mechanism yet to create a scalable scalarization loop,
5530 // so this is currently Invalid.
5531 if (VF.isScalable())
5532 return InstructionCost::getInvalid();
5533
5534 if (VF.isScalar())
5535 return 0;
5536
5537 InstructionCost Cost = 0;
5538 Type *RetTy = toVectorizedTy(Ty: I->getType(), EC: VF);
5539 if (!RetTy->isVoidTy() &&
5540 (!isa<LoadInst>(Val: I) || !TTI.supportsEfficientVectorElementLoadStore())) {
5541
5542 for (Type *VectorTy : getContainedTypes(Ty: RetTy)) {
5543 Cost += TTI.getScalarizationOverhead(
5544 Ty: cast<VectorType>(Val: VectorTy), DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
5545 /*Insert=*/true,
5546 /*Extract=*/false, CostKind);
5547 }
5548 }
5549
5550 // Some targets keep addresses scalar.
5551 if (isa<LoadInst>(Val: I) && !TTI.prefersVectorizedAddressing())
5552 return Cost;
5553
5554 // Some targets support efficient element stores.
5555 if (isa<StoreInst>(Val: I) && TTI.supportsEfficientVectorElementLoadStore())
5556 return Cost;
5557
5558 // Collect operands to consider.
5559 CallInst *CI = dyn_cast<CallInst>(Val: I);
5560 Instruction::op_range Ops = CI ? CI->args() : I->operands();
5561
5562 // Skip operands that do not require extraction/scalarization and do not incur
5563 // any overhead.
5564 SmallVector<Type *> Tys;
5565 for (auto *V : filterExtractingOperands(Ops, VF))
5566 Tys.push_back(Elt: maybeVectorizeType(Ty: V->getType(), VF));
5567 return Cost + TTI.getOperandsScalarizationOverhead(
5568 Args: filterExtractingOperands(Ops, VF), Tys, CostKind);
5569}
5570
/// For every memory instruction in the loop, compare the TTI costs of the
/// legal widening strategies (widen / widen-reverse, interleave,
/// gather-scatter, scalarize) at the given VF and record the cheapest
/// decision. Afterwards, unless the target prefers vectorized addressing,
/// force instructions feeding scalar addresses to stay scalar.
void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) {
  // Decisions are only recorded for vector VFs.
  if (VF.isScalar())
    return;
  NumPredStores = 0;
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      // Only loads/stores (instructions with a pointer operand) are decided.
      Value *Ptr = getLoadStorePointerOperand(V: &I);
      if (!Ptr)
        continue;

      // TODO: We should generate better code and update the cost model for
      // predicated uniform stores. Today they are treated as any other
      // predicated store (see added test cases in
      // invariant-store-vectorization.ll).
      if (isa<StoreInst>(Val: &I) && isScalarWithPredication(I: &I, VF))
        NumPredStores++;

      if (Legal->isUniformMemOp(I, VF)) {
        auto IsLegalToScalarize = [&]() {
          if (!VF.isScalable())
            // Scalarization of fixed length vectors "just works".
            return true;

          // We have dedicated lowering for unpredicated uniform loads and
          // stores. Note that even with tail folding we know that at least
          // one lane is active (i.e. generalized predication is not possible
          // here), and the logic below depends on this fact.
          if (!foldTailByMasking())
            return true;

          // For scalable vectors, a uniform memop load is always
          // uniform-by-parts and we know how to scalarize that.
          if (isa<LoadInst>(Val: I))
            return true;

          // A uniform store isn't neccessarily uniform-by-part
          // and we can't assume scalarization.
          auto &SI = cast<StoreInst>(Val&: I);
          return TheLoop->isLoopInvariant(V: SI.getValueOperand());
        };

        const InstructionCost GatherScatterCost =
            isLegalGatherOrScatter(V: &I, VF)
                ? getGatherScatterCost(I: &I, VF)
                : InstructionCost::getInvalid();

        // Load: Scalar load + broadcast
        // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract
        // FIXME: This cost is a significant under-estimate for tail folded
        // memory ops.
        const InstructionCost ScalarizationCost =
            IsLegalToScalarize() ? getUniformMemOpCost(I: &I, VF)
                                 : InstructionCost::getInvalid();

        // Choose better solution for the current VF, Note that Invalid
        // costs compare as maximumal large. If both are invalid, we get
        // scalable invalid which signals a failure and a vectorization abort.
        if (GatherScatterCost < ScalarizationCost)
          setWideningDecision(I: &I, VF, W: CM_GatherScatter, Cost: GatherScatterCost);
        else
          setWideningDecision(I: &I, VF, W: CM_Scalarize, Cost: ScalarizationCost);
        continue;
      }

      // We assume that widening is the best solution when possible.
      if (memoryInstructionCanBeWidened(I: &I, VF)) {
        InstructionCost Cost = getConsecutiveMemOpCost(I: &I, VF);
        int ConsecutiveStride = Legal->isConsecutivePtr(
            AccessTy: getLoadStoreType(I: &I), Ptr: getLoadStorePointerOperand(V: &I));
        assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
               "Expected consecutive stride.");
        InstWidening Decision =
            ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse;
        setWideningDecision(I: &I, VF, W: Decision, Cost);
        continue;
      }

      // Choose between Interleaving, Gather/Scatter or Scalarization.
      InstructionCost InterleaveCost = InstructionCost::getInvalid();
      unsigned NumAccesses = 1;
      if (isAccessInterleaved(Instr: &I)) {
        const auto *Group = getInterleavedAccessGroup(Instr: &I);
        assert(Group && "Fail to get an interleaved access group.");

        // Make one decision for the whole group.
        if (getWideningDecision(I: &I, VF) != CM_Unknown)
          continue;

        NumAccesses = Group->getNumMembers();
        if (interleavedAccessCanBeWidened(I: &I, VF))
          InterleaveCost = getInterleaveGroupCost(I: &I, VF);
      }

      // Per-member alternatives are scaled by the group size so they compare
      // fairly against the whole-group interleave cost.
      InstructionCost GatherScatterCost =
          isLegalGatherOrScatter(V: &I, VF)
              ? getGatherScatterCost(I: &I, VF) * NumAccesses
              : InstructionCost::getInvalid();

      InstructionCost ScalarizationCost =
          getMemInstScalarizationCost(I: &I, VF) * NumAccesses;

      // Choose better solution for the current VF,
      // write down this decision and use it during vectorization.
      InstructionCost Cost;
      InstWidening Decision;
      if (InterleaveCost <= GatherScatterCost &&
          InterleaveCost < ScalarizationCost) {
        Decision = CM_Interleave;
        Cost = InterleaveCost;
      } else if (GatherScatterCost < ScalarizationCost) {
        Decision = CM_GatherScatter;
        Cost = GatherScatterCost;
      } else {
        Decision = CM_Scalarize;
        Cost = ScalarizationCost;
      }
      // If the instructions belongs to an interleave group, the whole group
      // receives the same decision. The whole group receives the cost, but
      // the cost will actually be assigned to one instruction.
      if (const auto *Group = getInterleavedAccessGroup(Instr: &I))
        setWideningDecision(Grp: Group, VF, W: Decision, Cost);
      else
        setWideningDecision(I: &I, VF, W: Decision, Cost);
    }
  }

  // Make sure that any load of address and any other address computation
  // remains scalar unless there is gather/scatter support. This avoids
  // inevitable extracts into address registers, and also has the benefit of
  // activating LSR more, since that pass can't optimize vectorized
  // addresses.
  if (TTI.prefersVectorizedAddressing())
    return;

  // Start with all scalar pointer uses.
  SmallPtrSet<Instruction *, 8> AddrDefs;
  for (BasicBlock *BB : TheLoop->blocks())
    for (Instruction &I : *BB) {
      Instruction *PtrDef =
          dyn_cast_or_null<Instruction>(Val: getLoadStorePointerOperand(V: &I));
      if (PtrDef && TheLoop->contains(Inst: PtrDef) &&
          getWideningDecision(I: &I, VF) != CM_GatherScatter)
        AddrDefs.insert(Ptr: PtrDef);
    }

  // Add all instructions used to generate the addresses.
  // (Same-block, non-phi operands are pulled in transitively.)
  SmallVector<Instruction *, 4> Worklist;
  append_range(C&: Worklist, R&: AddrDefs);
  while (!Worklist.empty()) {
    Instruction *I = Worklist.pop_back_val();
    for (auto &Op : I->operands())
      if (auto *InstOp = dyn_cast<Instruction>(Val&: Op))
        if ((InstOp->getParent() == I->getParent()) && !isa<PHINode>(Val: InstOp) &&
            AddrDefs.insert(Ptr: InstOp).second)
          Worklist.push_back(Elt: InstOp);
  }

  for (auto *I : AddrDefs) {
    if (isa<LoadInst>(Val: I)) {
      // Setting the desired widening decision should ideally be handled in
      // by cost functions, but since this involves the task of finding out
      // if the loaded register is involved in an address computation, it is
      // instead changed here when we know this is the case.
      InstWidening Decision = getWideningDecision(I, VF);
      if (Decision == CM_Widen || Decision == CM_Widen_Reverse)
        // Scalarize a widened load of address.
        setWideningDecision(
            I, VF, W: CM_Scalarize,
            Cost: (VF.getKnownMinValue() *
                  getMemoryInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1))));
      else if (const auto *Group = getInterleavedAccessGroup(Instr: I)) {
        // Scalarize an interleave group of address loads.
        for (unsigned I = 0; I < Group->getFactor(); ++I) {
          if (Instruction *Member = Group->getMember(Index: I))
            setWideningDecision(
                I: Member, VF, W: CM_Scalarize,
                Cost: (VF.getKnownMinValue() *
                      getMemoryInstructionCost(I: Member,
                                               VF: ElementCount::getFixed(MinVal: 1))));
        }
      }
    } else {
      // Cannot scalarize fixed-order recurrence phis at the moment.
      if (isa<PHINode>(Val: I) && Legal->isFixedOrderRecurrence(Phi: cast<PHINode>(Val: I)))
        continue;

      // Make sure I gets scalarized and a cost estimate without
      // scalarization overhead.
      ForcedScalars[VF].insert(Ptr: I);
    }
  }
}
5762
/// For every call in the loop, compare the costs of scalarizing it, calling a
/// vector library variant (from VFDatabase), or using a vector intrinsic, and
/// record the cheapest decision for the given (vector) VF.
void LoopVectorizationCostModel::setVectorizedCallDecision(ElementCount VF) {
  assert(!VF.isScalar() &&
         "Trying to set a vectorization decision for a scalar VF");

  auto ForcedScalar = ForcedScalars.find(Val: VF);
  for (BasicBlock *BB : TheLoop->blocks()) {
    // For each instruction in the old loop.
    for (Instruction &I : *BB) {
      CallInst *CI = dyn_cast<CallInst>(Val: &I);

      if (!CI)
        continue;

      // Invalid costs lose every comparison below unless explicitly computed.
      InstructionCost ScalarCost = InstructionCost::getInvalid();
      InstructionCost VectorCost = InstructionCost::getInvalid();
      InstructionCost IntrinsicCost = InstructionCost::getInvalid();
      Function *ScalarFunc = CI->getCalledFunction();
      Type *ScalarRetTy = CI->getType();
      SmallVector<Type *, 4> Tys, ScalarTys;
      for (auto &ArgOp : CI->args())
        ScalarTys.push_back(Elt: ArgOp->getType());

      // Estimate cost of scalarized vector call. The source operands are
      // assumed to be vectors, so we need to extract individual elements from
      // there, execute VF scalar calls, and then gather the result into the
      // vector return value.
      InstructionCost ScalarCallCost =
          TTI.getCallInstrCost(F: ScalarFunc, RetTy: ScalarRetTy, Tys: ScalarTys, CostKind);

      // Compute costs of unpacking argument values for the scalar calls and
      // packing the return values to a vector.
      InstructionCost ScalarizationCost = getScalarizationOverhead(I: CI, VF);

      ScalarCost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost;
      // Honor ForcedScalars and UniformAfterVectorization decisions.
      // TODO: For calls, it might still be more profitable to widen. Use
      // VPlan-based cost model to compare different options.
      if (VF.isVector() && ((ForcedScalar != ForcedScalars.end() &&
                             ForcedScalar->second.contains(Ptr: CI)) ||
                            isUniformAfterVectorization(I: CI, VF))) {
        setCallWideningDecision(CI, VF, Kind: CM_Scalarize, Variant: nullptr,
                                IID: Intrinsic::not_intrinsic, MaskPos: std::nullopt,
                                Cost: ScalarCost);
        continue;
      }

      bool MaskRequired = Legal->isMaskRequired(I: CI);
      // Compute corresponding vector type for return value and arguments.
      Type *RetTy = toVectorizedTy(Ty: ScalarRetTy, EC: VF);
      for (Type *ScalarTy : ScalarTys)
        Tys.push_back(Elt: toVectorizedTy(Ty: ScalarTy, EC: VF));

      // An in-loop reduction using an fmuladd intrinsic is a special case;
      // we don't want the normal cost for that intrinsic.
      if (RecurrenceDescriptor::isFMulAddIntrinsic(I: CI))
        if (auto RedCost = getReductionPatternCost(I: CI, VF, Ty: RetTy)) {
          setCallWideningDecision(CI, VF, Kind: CM_IntrinsicCall, Variant: nullptr,
                                  IID: getVectorIntrinsicIDForCall(CI, TLI),
                                  MaskPos: std::nullopt, Cost: *RedCost);
          continue;
        }

      // Find the cost of vectorizing the call, if we can find a suitable
      // vector variant of the function.
      VFInfo FuncInfo;
      Function *VecFunc = nullptr;
      // Search through any available variants for one we can use at this VF.
      for (VFInfo &Info : VFDatabase::getMappings(CI: *CI)) {
        // Must match requested VF.
        if (Info.Shape.VF != VF)
          continue;

        // Must take a mask argument if one is required
        if (MaskRequired && !Info.isMasked())
          continue;

        // Check that all parameter kinds are supported
        bool ParamsOk = true;
        for (VFParameter Param : Info.Shape.Parameters) {
          switch (Param.ParamKind) {
          case VFParamKind::Vector:
            break;
          case VFParamKind::OMP_Uniform: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Make sure the scalar parameter in the loop is invariant.
            if (!PSE.getSE()->isLoopInvariant(S: PSE.getSCEV(V: ScalarParam),
                                              L: TheLoop))
              ParamsOk = false;
            break;
          }
          case VFParamKind::OMP_Linear: {
            Value *ScalarParam = CI->getArgOperand(i: Param.ParamPos);
            // Find the stride for the scalar parameter in this loop and see if
            // it matches the stride for the variant.
            // TODO: do we need to figure out the cost of an extract to get the
            // first lane? Or do we hope that it will be folded away?
            ScalarEvolution *SE = PSE.getSE();
            const auto *SAR =
                dyn_cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: ScalarParam));

            if (!SAR || SAR->getLoop() != TheLoop) {
              ParamsOk = false;
              break;
            }

            const SCEVConstant *Step =
                dyn_cast<SCEVConstant>(Val: SAR->getStepRecurrence(SE&: *SE));

            if (!Step ||
                Step->getAPInt().getSExtValue() != Param.LinearStepOrPos)
              ParamsOk = false;

            break;
          }
          case VFParamKind::GlobalPredicate:
            break;
          default:
            // Any other parameter kind is not supported here.
            ParamsOk = false;
            break;
          }
        }

        if (!ParamsOk)
          continue;

        // Found a suitable candidate, stop here.
        VecFunc = CI->getModule()->getFunction(Name: Info.VectorName);
        FuncInfo = Info;
        break;
      }

      if (TLI && VecFunc && !CI->isNoBuiltin())
        VectorCost = TTI.getCallInstrCost(F: nullptr, RetTy, Tys, CostKind);

      // Find the cost of an intrinsic; some targets may have instructions that
      // perform the operation without needing an actual call.
      Intrinsic::ID IID = getVectorIntrinsicIDForCall(CI, TLI);
      if (IID != Intrinsic::not_intrinsic)
        IntrinsicCost = getVectorIntrinsicCost(CI, VF);

      // Pick the cheapest option; ties favor the vector call over scalarizing
      // and the intrinsic over both.
      InstructionCost Cost = ScalarCost;
      InstWidening Decision = CM_Scalarize;

      if (VectorCost <= Cost) {
        Cost = VectorCost;
        Decision = CM_VectorCall;
      }

      if (IntrinsicCost <= Cost) {
        Cost = IntrinsicCost;
        Decision = CM_IntrinsicCall;
      }

      setCallWideningDecision(CI, VF, Kind: Decision, Variant: VecFunc, IID,
                              MaskPos: FuncInfo.getParamIndexForOptionalMask(), Cost);
    }
  }
}
5921
5922bool LoopVectorizationCostModel::shouldConsiderInvariant(Value *Op) {
5923 if (!Legal->isInvariant(V: Op))
5924 return false;
5925 // Consider Op invariant, if it or its operands aren't predicated
5926 // instruction in the loop. In that case, it is not trivially hoistable.
5927 auto *OpI = dyn_cast<Instruction>(Val: Op);
5928 return !OpI || !TheLoop->contains(Inst: OpI) ||
5929 (!isPredicatedInst(I: OpI) &&
5930 (!isa<PHINode>(Val: OpI) || OpI->getParent() != TheLoop->getHeader()) &&
5931 all_of(Range: OpI->operands(),
5932 P: [this](Value *Op) { return shouldConsiderInvariant(Op); }));
5933}
5934
// Compute the expected cost of vectorizing instruction \p I with
// vectorization factor \p VF. Relies on the per-VF widening decisions,
// scalarization costs and minimal-bitwidth information collected earlier;
// returns an invalid cost when the instruction cannot be vectorized at \p VF.
InstructionCost
LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                               ElementCount VF) {
  // If we know that this instruction will remain uniform, check the cost of
  // the scalar version.
  if (isUniformAfterVectorization(I, VF))
    VF = ElementCount::getFixed(MinVal: 1);

  if (VF.isVector() && isProfitableToScalarize(I, VF))
    return InstsToScalarize[VF][I];

  // Forced scalars do not have any scalarization overhead.
  // Their cost is the scalar cost replicated once per (known-min) lane.
  auto ForcedScalar = ForcedScalars.find(Val: VF);
  if (VF.isVector() && ForcedScalar != ForcedScalars.end()) {
    auto InstSet = ForcedScalar->second;
    if (InstSet.count(Ptr: I))
      return getInstructionCost(I, VF: ElementCount::getFixed(MinVal: 1)) *
             VF.getKnownMinValue();
  }

  // If the instruction was proven to need fewer bits, cost it at the
  // narrowed type instead of its declared type.
  Type *RetTy = I->getType();
  if (canTruncateToMinimalBitwidth(I, VF))
    RetTy = IntegerType::get(C&: RetTy->getContext(), NumBits: MinBWs[I]);
  auto *SE = PSE.getSE();

  Type *VectorTy;
  if (isScalarAfterVectorization(I, VF)) {
    // Assertion helper: true if exactly one scalar copy of I remains (i.e.
    // neither I nor any of its users is scheduled for scalarization at VF).
    [[maybe_unused]] auto HasSingleCopyAfterVectorization =
        [this](Instruction *I, ElementCount VF) -> bool {
      if (VF.isScalar())
        return true;

      auto Scalarized = InstsToScalarize.find(Val: VF);
      assert(Scalarized != InstsToScalarize.end() &&
             "VF not yet analyzed for scalarization profitability");
      return !Scalarized->second.count(Val: I) &&
             llvm::all_of(Range: I->users(), P: [&](User *U) {
               auto *UI = cast<Instruction>(Val: U);
               return !Scalarized->second.count(Val: UI);
             });
    };

    // With the exception of GEPs and PHIs, after scalarization there should
    // only be one copy of the instruction generated in the loop. This is
    // because the VF is either 1, or any instructions that need scalarizing
    // have already been dealt with by the time we get here. As a result,
    // it means we don't have to multiply the instruction cost by VF.
    assert(I->getOpcode() == Instruction::GetElementPtr ||
           I->getOpcode() == Instruction::PHI ||
           (I->getOpcode() == Instruction::BitCast &&
            I->getType()->isPointerTy()) ||
           HasSingleCopyAfterVectorization(I, VF));
    VectorTy = RetTy;
  } else
    VectorTy = toVectorizedTy(Ty: RetTy, EC: VF);

  // Bail out when the target cannot legalize the vector type into any
  // number of native registers.
  if (VF.isVector() && VectorTy->isVectorTy() &&
      !TTI.getNumberOfParts(Tp: VectorTy))
    return InstructionCost::getInvalid();

  // TODO: We need to estimate the cost of intrinsic calls.
  switch (I->getOpcode()) {
  case Instruction::GetElementPtr:
    // We mark this instruction as zero-cost because the cost of GEPs in
    // vectorized code depends on whether the corresponding memory instruction
    // is scalarized or not. Therefore, we handle GEPs with the memory
    // instruction cost.
    return 0;
  case Instruction::Br: {
    // In cases of scalarized and predicated instructions, there will be VF
    // predicated blocks in the vectorized loop. Each branch around these
    // blocks requires also an extract of its vector compare i1 element.
    // Note that the conditional branch from the loop latch will be replaced by
    // a single branch controlling the loop, so there is no extra overhead from
    // scalarization.
    bool ScalarPredicatedBB = false;
    BranchInst *BI = cast<BranchInst>(Val: I);
    if (VF.isVector() && BI->isConditional() &&
        (PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 0)) ||
         PredicatedBBsAfterVectorization[VF].count(Ptr: BI->getSuccessor(i: 1))) &&
        BI->getParent() != TheLoop->getLoopLatch())
      ScalarPredicatedBB = true;

    if (ScalarPredicatedBB) {
      // Not possible to scalarize scalable vector with predicated instructions.
      if (VF.isScalable())
        return InstructionCost::getInvalid();
      // Return cost for branches around scalarized and predicated blocks.
      auto *VecI1Ty =
          VectorType::get(ElementType: IntegerType::getInt1Ty(C&: RetTy->getContext()), EC: VF);
      return (
          TTI.getScalarizationOverhead(
              Ty: VecI1Ty, DemandedElts: APInt::getAllOnes(numBits: VF.getFixedValue()),
              /*Insert*/ false, /*Extract*/ true, CostKind) +
          (TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind) * VF.getFixedValue()));
    }

    if (I->getParent() == TheLoop->getLoopLatch() || VF.isScalar())
      // The back-edge branch will remain, as will all scalar branches.
      return TTI.getCFInstrCost(Opcode: Instruction::Br, CostKind);

    // This branch will be eliminated by if-conversion.
    return 0;
    // Note: We currently assume zero cost for an unconditional branch inside
    // a predicated block since it will become a fall-through, although we
    // may decide in the future to call TTI for all branches.
  }
  case Instruction::Switch: {
    if (VF.isScalar())
      return TTI.getCFInstrCost(Opcode: Instruction::Switch, CostKind);
    // A vectorized switch is costed as one vector compare per case value.
    auto *Switch = cast<SwitchInst>(Val: I);
    return Switch->getNumCases() *
           TTI.getCmpSelInstrCost(
               Opcode: Instruction::ICmp,
               ValTy: toVectorTy(Scalar: Switch->getCondition()->getType(), EC: VF),
               CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: I->getContext()), EC: VF),
               VecPred: CmpInst::ICMP_EQ, CostKind);
  }
  case Instruction::PHI: {
    auto *Phi = cast<PHINode>(Val: I);

    // First-order recurrences are replaced by vector shuffles inside the loop.
    if (VF.isVector() && Legal->isFixedOrderRecurrence(Phi)) {
      // Cost a splice shuffle combining the last lane of the previous
      // iteration's vector with the current one.
      SmallVector<int> Mask(VF.getKnownMinValue());
      std::iota(first: Mask.begin(), last: Mask.end(), value: VF.getKnownMinValue() - 1);
      return TTI.getShuffleCost(Kind: TargetTransformInfo::SK_Splice,
                                DstTy: cast<VectorType>(Val: VectorTy),
                                SrcTy: cast<VectorType>(Val: VectorTy), Mask, CostKind,
                                Index: VF.getKnownMinValue() - 1);
    }

    // Phi nodes in non-header blocks (not inductions, reductions, etc.) are
    // converted into select instructions. We require N - 1 selects per phi
    // node, where N is the number of incoming values.
    if (VF.isVector() && Phi->getParent() != TheLoop->getHeader()) {
      Type *ResultTy = Phi->getType();

      // All instructions in an Any-of reduction chain are narrowed to bool.
      // Check if that is the case for this phi node.
      auto *HeaderUser = cast_if_present<PHINode>(
          Val: find_singleton<User>(Range: Phi->users(), P: [this](User *U, bool) -> User * {
            auto *Phi = dyn_cast<PHINode>(Val: U);
            if (Phi && Phi->getParent() == TheLoop->getHeader())
              return Phi;
            return nullptr;
          }));
      if (HeaderUser) {
        auto &ReductionVars = Legal->getReductionVars();
        auto Iter = ReductionVars.find(Key: HeaderUser);
        if (Iter != ReductionVars.end() &&
            RecurrenceDescriptor::isAnyOfRecurrenceKind(
                Kind: Iter->second.getRecurrenceKind()))
          ResultTy = Type::getInt1Ty(C&: Phi->getContext());
      }
      return (Phi->getNumIncomingValues() - 1) *
             TTI.getCmpSelInstrCost(
                 Opcode: Instruction::Select, ValTy: toVectorTy(Scalar: ResultTy, EC: VF),
                 CondTy: toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF),
                 VecPred: CmpInst::BAD_ICMP_PREDICATE, CostKind);
    }

    // When tail folding with EVL, if the phi is part of an out of loop
    // reduction then it will be transformed into a wide vp_merge.
    if (VF.isVector() && foldTailWithEVL() &&
        Legal->getReductionVars().contains(Key: Phi) && !isInLoopReduction(Phi)) {
      IntrinsicCostAttributes ICA(
          Intrinsic::vp_merge, toVectorTy(Scalar: Phi->getType(), EC: VF),
          {toVectorTy(Scalar: Type::getInt1Ty(C&: Phi->getContext()), EC: VF)});
      return TTI.getIntrinsicInstrCost(ICA, CostKind);
    }

    return TTI.getCFInstrCost(Opcode: Instruction::PHI, CostKind);
  }
  case Instruction::UDiv:
  case Instruction::SDiv:
  case Instruction::URem:
  case Instruction::SRem:
    // Predicated div/rem is either scalarized or executed with a safe
    // divisor; pick whichever alternative is cheaper.
    if (VF.isVector() && isPredicatedInst(I)) {
      const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF);
      return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ?
        ScalarCost : SafeDivisorCost;
    }
    // We've proven all lanes safe to speculate, fall through.
    [[fallthrough]];
  case Instruction::Add:
  case Instruction::Sub: {
    auto Info = Legal->getHistogramInfo(I);
    if (Info && VF.isVector()) {
      const HistogramInfo *HGram = Info.value();
      // Assume that a non-constant update value (or a constant != 1) requires
      // a multiply, and add that into the cost.
      InstructionCost MulCost = TTI::TCC_Free;
      ConstantInt *RHS = dyn_cast<ConstantInt>(Val: I->getOperand(i: 1));
      if (!RHS || RHS->getZExtValue() != 1)
        MulCost =
            TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);

      // Find the cost of the histogram operation itself.
      Type *PtrTy = VectorType::get(ElementType: HGram->Load->getPointerOperandType(), EC: VF);
      Type *ScalarTy = I->getType();
      Type *MaskTy = VectorType::get(ElementType: Type::getInt1Ty(C&: I->getContext()), EC: VF);
      IntrinsicCostAttributes ICA(Intrinsic::experimental_vector_histogram_add,
                                  Type::getVoidTy(C&: I->getContext()),
                                  {PtrTy, ScalarTy, MaskTy});

      // Add the costs together with the add/sub operation.
      return TTI.getIntrinsicInstrCost(ICA, CostKind) + MulCost +
             TTI.getArithmeticInstrCost(Opcode: I->getOpcode(), Ty: VectorTy, CostKind);
    }
    [[fallthrough]];
  }
  case Instruction::FAdd:
  case Instruction::FSub:
  case Instruction::Mul:
  case Instruction::FMul:
  case Instruction::FDiv:
  case Instruction::FRem:
  case Instruction::Shl:
  case Instruction::LShr:
  case Instruction::AShr:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor: {
    // If we're speculating on the stride being 1, the multiplication may
    // fold away. We can generalize this for all operations using the notion
    // of neutral elements. (TODO)
    if (I->getOpcode() == Instruction::Mul &&
        ((TheLoop->isLoopInvariant(V: I->getOperand(i: 0)) &&
          PSE.getSCEV(V: I->getOperand(i: 0))->isOne()) ||
         (TheLoop->isLoopInvariant(V: I->getOperand(i: 1)) &&
          PSE.getSCEV(V: I->getOperand(i: 1))->isOne())))
      return 0;

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    // Certain instructions can be cheaper to vectorize if they have a constant
    // second vector operand. One example of this are shifts on x86.
    Value *Op2 = I->getOperand(i: 1);
    if (!isa<Constant>(Val: Op2) && TheLoop->isLoopInvariant(V: Op2) &&
        PSE.getSE()->isSCEVable(Ty: Op2->getType()) &&
        isa<SCEVConstant>(Val: PSE.getSCEV(V: Op2))) {
      Op2 = cast<SCEVConstant>(Val: PSE.getSCEV(V: Op2))->getValue();
    }
    auto Op2Info = TTI.getOperandInfo(V: Op2);
    // Invariant (hoistable) operands behave like uniform values for costing.
    if (Op2Info.Kind == TargetTransformInfo::OK_AnyValue &&
        shouldConsiderInvariant(Op: Op2))
      Op2Info.Kind = TargetTransformInfo::OK_UniformValue;

    SmallVector<const Value *, 4> Operands(I->operand_values());
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: Op2Info, Args: Operands, CxtI: I, TLibInfo: TLI);
  }
  case Instruction::FNeg: {
    return TTI.getArithmeticInstrCost(
        Opcode: I->getOpcode(), Ty: VectorTy, CostKind,
        Opd1Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Opd2Info: {.Kind: TargetTransformInfo::OK_AnyValue, .Properties: TargetTransformInfo::OP_None},
        Args: I->getOperand(i: 0), CxtI: I);
  }
  case Instruction::Select: {
    SelectInst *SI = cast<SelectInst>(Val: I);
    const SCEV *CondSCEV = SE->getSCEV(V: SI->getCondition());
    bool ScalarCond = (SE->isLoopInvariant(S: CondSCEV, L: TheLoop));

    const Value *Op0, *Op1;
    using namespace llvm::PatternMatch;
    if (!ScalarCond && (match(V: I, P: m_LogicalAnd(L: m_Value(V&: Op0), R: m_Value(V&: Op1))) ||
                        match(V: I, P: m_LogicalOr(L: m_Value(V&: Op0), R: m_Value(V&: Op1))))) {
      // select x, y, false --> x & y
      // select x, true, y --> x | y
      const auto [Op1VK, Op1VP] = TTI::getOperandInfo(V: Op0);
      const auto [Op2VK, Op2VP] = TTI::getOperandInfo(V: Op1);
      assert(Op0->getType()->getScalarSizeInBits() == 1 &&
              Op1->getType()->getScalarSizeInBits() == 1);

      SmallVector<const Value *, 2> Operands{Op0, Op1};
      return TTI.getArithmeticInstrCost(
          Opcode: match(V: I, P: m_LogicalOr()) ? Instruction::Or : Instruction::And, Ty: VectorTy,
          CostKind, Opd1Info: {.Kind: Op1VK, .Properties: Op1VP}, Opd2Info: {.Kind: Op2VK, .Properties: Op2VP}, Args: Operands, CxtI: I);
    }

    Type *CondTy = SI->getCondition()->getType();
    if (!ScalarCond)
      CondTy = VectorType::get(ElementType: CondTy, EC: VF);

    CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
    if (auto *Cmp = dyn_cast<CmpInst>(Val: SI->getCondition()))
      Pred = Cmp->getPredicate();
    return TTI.getCmpSelInstrCost(Opcode: I->getOpcode(), ValTy: VectorTy, CondTy, VecPred: Pred,
                                  CostKind, Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None},
                                  Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::ICmp:
  case Instruction::FCmp: {
    Type *ValTy = I->getOperand(i: 0)->getType();

    if (canTruncateToMinimalBitwidth(I, VF)) {
      [[maybe_unused]] Instruction *Op0AsInstruction =
          dyn_cast<Instruction>(Val: I->getOperand(i: 0));
      assert((!canTruncateToMinimalBitwidth(Op0AsInstruction, VF) ||
              MinBWs[I] == MinBWs[Op0AsInstruction]) &&
             "if both the operand and the compare are marked for "
             "truncation, they must have the same bitwidth");
      ValTy = IntegerType::get(C&: ValTy->getContext(), NumBits: MinBWs[I]);
    }

    // Compares are costed on the (possibly narrowed) operand type, not RetTy.
    VectorTy = toVectorTy(Scalar: ValTy, EC: VF);
    return TTI.getCmpSelInstrCost(
        Opcode: I->getOpcode(), ValTy: VectorTy, CondTy: CmpInst::makeCmpResultType(opnd_type: VectorTy),
        VecPred: cast<CmpInst>(Val: I)->getPredicate(), CostKind,
        Op1Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, Op2Info: {.Kind: TTI::OK_AnyValue, .Properties: TTI::OP_None}, I);
  }
  case Instruction::Store:
  case Instruction::Load: {
    ElementCount Width = VF;
    if (Width.isVector()) {
      InstWidening Decision = getWideningDecision(I, VF: Width);
      assert(Decision != CM_Unknown &&
             "CM decision should be taken at this point");
      if (getWideningCost(I, VF) == InstructionCost::getInvalid())
        return InstructionCost::getInvalid();
      if (Decision == CM_Scalarize)
        Width = ElementCount::getFixed(MinVal: 1);
    }
    VectorTy = toVectorTy(Scalar: getLoadStoreType(I), EC: Width);
    return getMemoryInstructionCost(I, VF);
  }
  case Instruction::BitCast:
    // Pointer bitcasts are free; they disappear during codegen.
    if (I->getType()->isPointerTy())
      return 0;
    [[fallthrough]];
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::FPExt:
  case Instruction::PtrToInt:
  case Instruction::IntToPtr:
  case Instruction::SIToFP:
  case Instruction::UIToFP:
  case Instruction::Trunc:
  case Instruction::FPTrunc: {
    // Computes the CastContextHint from a Load/Store instruction.
    auto ComputeCCH = [&](Instruction *I) -> TTI::CastContextHint {
      assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
             "Expected a load or a store!");

      if (VF.isScalar() || !TheLoop->contains(Inst: I))
        return TTI::CastContextHint::Normal;

      switch (getWideningDecision(I, VF)) {
      case LoopVectorizationCostModel::CM_GatherScatter:
        return TTI::CastContextHint::GatherScatter;
      case LoopVectorizationCostModel::CM_Interleave:
        return TTI::CastContextHint::Interleave;
      case LoopVectorizationCostModel::CM_Scalarize:
      case LoopVectorizationCostModel::CM_Widen:
        return isPredicatedInst(I) ? TTI::CastContextHint::Masked
                                   : TTI::CastContextHint::Normal;
      case LoopVectorizationCostModel::CM_Widen_Reverse:
        return TTI::CastContextHint::Reversed;
      case LoopVectorizationCostModel::CM_Unknown:
        llvm_unreachable("Instr did not go through cost modelling?");
      case LoopVectorizationCostModel::CM_VectorCall:
      case LoopVectorizationCostModel::CM_IntrinsicCall:
        llvm_unreachable_internal(msg: "Instr has invalid widening decision");
      }

      llvm_unreachable("Unhandled case!");
    };

    unsigned Opcode = I->getOpcode();
    TTI::CastContextHint CCH = TTI::CastContextHint::None;
    // For Trunc, the context is the only user, which must be a StoreInst.
    if (Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) {
      if (I->hasOneUse())
        if (StoreInst *Store = dyn_cast<StoreInst>(Val: *I->user_begin()))
          CCH = ComputeCCH(Store);
    }
    // For Z/Sext, the context is the operand, which must be a LoadInst.
    else if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt ||
             Opcode == Instruction::FPExt) {
      if (LoadInst *Load = dyn_cast<LoadInst>(Val: I->getOperand(i: 0)))
        CCH = ComputeCCH(Load);
    }

    // We optimize the truncation of induction variables having constant
    // integer steps. The cost of these truncations is the same as the scalar
    // operation.
    if (isOptimizableIVTruncate(I, VF)) {
      auto *Trunc = cast<TruncInst>(Val: I);
      return TTI.getCastInstrCost(Opcode: Instruction::Trunc, Dst: Trunc->getDestTy(),
                                  Src: Trunc->getSrcTy(), CCH, CostKind, I: Trunc);
    }

    // Detect reduction patterns
    if (auto RedCost = getReductionPatternCost(I, VF, Ty: VectorTy))
      return *RedCost;

    Type *SrcScalarTy = I->getOperand(i: 0)->getType();
    Instruction *Op0AsInstruction = dyn_cast<Instruction>(Val: I->getOperand(i: 0));
    if (canTruncateToMinimalBitwidth(I: Op0AsInstruction, VF))
      SrcScalarTy =
          IntegerType::get(C&: SrcScalarTy->getContext(), NumBits: MinBWs[Op0AsInstruction]);
    Type *SrcVecTy =
        VectorTy->isVectorTy() ? toVectorTy(Scalar: SrcScalarTy, EC: VF) : SrcScalarTy;

    if (canTruncateToMinimalBitwidth(I, VF)) {
      // If the result type is <= the source type, there will be no extend
      // after truncating the users to the minimal required bitwidth.
      if (VectorTy->getScalarSizeInBits() <= SrcVecTy->getScalarSizeInBits() &&
          (I->getOpcode() == Instruction::ZExt ||
           I->getOpcode() == Instruction::SExt))
        return 0;
    }

    return TTI.getCastInstrCost(Opcode, Dst: VectorTy, Src: SrcVecTy, CCH, CostKind, I);
  }
  case Instruction::Call:
    // Calls use the widening decisions recorded earlier (vector variant,
    // intrinsic, or scalarization).
    return getVectorCallCost(CI: cast<CallInst>(Val: I), VF);
  case Instruction::ExtractValue:
    // Delegate to the generic TTI user-cost query.
    return TTI.getInstructionCost(U: I, CostKind);
  case Instruction::Alloca:
    // We cannot easily widen alloca to a scalable alloca, as
    // the result would need to be a vector of pointers.
    if (VF.isScalable())
      return InstructionCost::getInvalid();
    [[fallthrough]];
  default:
    // This opcode is unknown. Assume that it is the same as 'mul'.
    return TTI.getArithmeticInstrCost(Opcode: Instruction::Mul, Ty: VectorTy, CostKind);
  } // end of switch.
}
6372
// Populate ValuesToIgnore / VecValuesToIgnore with instructions the cost model
// should skip: ephemeral values, stores to invariant reduction addresses
// (these sink out of the loop), pointer operands of non-insert-position
// interleave-group members, and instructions that become trivially dead after
// vectorization. ValuesToIgnore holds values dead in both scalar and vector
// versions; VecValuesToIgnore holds values dead only in the vector version.
void LoopVectorizationCostModel::collectValuesToIgnore() {
  // Ignore ephemeral values.
  CodeMetrics::collectEphemeralValues(L: TheLoop, AC, EphValues&: ValuesToIgnore);

  // Worklists; both grow while being iterated below, which transitively
  // processes operands of newly discovered dead values.
  SmallVector<Value *, 4> DeadInterleavePointerOps;
  SmallVector<Value *, 4> DeadOps;

  // If a scalar epilogue is required, users outside the loop won't use
  // live-outs from the vector loop but from the scalar epilogue. Ignore them if
  // that is the case.
  bool RequiresScalarEpilogue = requiresScalarEpilogue(IsVectorizing: true);
  auto IsLiveOutDead = [this, RequiresScalarEpilogue](User *U) {
    return RequiresScalarEpilogue &&
           !TheLoop->contains(BB: cast<Instruction>(Val: U)->getParent());
  };

  // Walk blocks in reverse RPO and instructions bottom-up so users are seen
  // before their operands.
  LoopBlocksDFS DFS(TheLoop);
  DFS.perform(LI);
  MapVector<Value *, SmallVector<Value *>> DeadInvariantStoreOps;
  for (BasicBlock *BB : reverse(C: make_range(x: DFS.beginRPO(), y: DFS.endRPO())))
    for (Instruction &I : reverse(C&: *BB)) {
      // Find all stores to invariant variables. Since they are going to sink
      // outside the loop we do not need calculate cost for them.
      StoreInst *SI;
      if ((SI = dyn_cast<StoreInst>(Val: &I)) &&
          Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
        ValuesToIgnore.insert(Ptr: &I);
        DeadInvariantStoreOps[SI->getPointerOperand()].push_back(
            Elt: SI->getValueOperand());
      }

      if (VecValuesToIgnore.contains(Ptr: &I) || ValuesToIgnore.contains(Ptr: &I))
        continue;

      // Add instructions that would be trivially dead and are only used by
      // values already ignored to DeadOps to seed worklist.
      if (wouldInstructionBeTriviallyDead(I: &I, TLI) &&
          all_of(Range: I.users(), P: [this, IsLiveOutDead](User *U) {
            return VecValuesToIgnore.contains(Ptr: U) ||
                   ValuesToIgnore.contains(Ptr: U) || IsLiveOutDead(U);
          }))
        DeadOps.push_back(Elt: &I);

      // For interleave groups, we only create a pointer for the start of the
      // interleave group. Queue up addresses of group members except the insert
      // position for further processing.
      if (isAccessInterleaved(Instr: &I)) {
        auto *Group = getInterleavedAccessGroup(Instr: &I);
        if (Group->getInsertPos() == &I)
          continue;
        Value *PointerOp = getLoadStorePointerOperand(V: &I);
        DeadInterleavePointerOps.push_back(Elt: PointerOp);
      }

      // Queue branches for analysis. They are dead, if their successors only
      // contain dead instructions.
      if (auto *Br = dyn_cast<BranchInst>(Val: &I)) {
        if (Br->isConditional())
          DeadOps.push_back(Elt: &I);
      }
    }

  // Mark ops feeding interleave group members as free, if they are only used
  // by other dead computations. Note: index-based loop because the worklist
  // grows during iteration.
  for (unsigned I = 0; I != DeadInterleavePointerOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadInterleavePointerOps[I]);
    if (!Op || !TheLoop->contains(Inst: Op) || any_of(Range: Op->users(), P: [this](User *U) {
          Instruction *UI = cast<Instruction>(Val: U);
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 (!isAccessInterleaved(Instr: UI) ||
                  getInterleavedAccessGroup(Instr: UI)->getInsertPos() == UI);
        }))
      continue;
    VecValuesToIgnore.insert(Ptr: Op);
    DeadInterleavePointerOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
  }

  // All but the final stored value of each invariant store chain are dead.
  for (const auto &[_, Ops] : DeadInvariantStoreOps)
    llvm::append_range(C&: DeadOps, R: drop_end(RangeOrContainer: Ops));

  // Mark ops that would be trivially dead and are only used by ignored
  // instructions as free.
  BasicBlock *Header = TheLoop->getHeader();

  // Returns true if the block contains only dead instructions. Such blocks will
  // be removed by VPlan-to-VPlan transforms and won't be considered by the
  // VPlan-based cost model, so skip them in the legacy cost-model as well.
  auto IsEmptyBlock = [this](BasicBlock *BB) {
    return all_of(Range&: *BB, P: [this](Instruction &I) {
      return ValuesToIgnore.contains(Ptr: &I) || VecValuesToIgnore.contains(Ptr: &I) ||
             (isa<BranchInst>(Val: &I) && !cast<BranchInst>(Val: &I)->isConditional());
    });
  };
  // Index-based loop: DeadOps grows while being processed.
  for (unsigned I = 0; I != DeadOps.size(); ++I) {
    auto *Op = dyn_cast<Instruction>(Val: DeadOps[I]);

    // Check if the branch should be considered dead.
    if (auto *Br = dyn_cast_or_null<BranchInst>(Val: Op)) {
      BasicBlock *ThenBB = Br->getSuccessor(i: 0);
      BasicBlock *ElseBB = Br->getSuccessor(i: 1);
      // Don't considers branches leaving the loop for simplification.
      if (!TheLoop->contains(BB: ThenBB) || !TheLoop->contains(BB: ElseBB))
        continue;
      bool ThenEmpty = IsEmptyBlock(ThenBB);
      bool ElseEmpty = IsEmptyBlock(ElseBB);
      // The branch is dead when both successors are empty, or when one is
      // empty and falls through to the other without phis.
      if ((ThenEmpty && ElseEmpty) ||
          (ThenEmpty && ThenBB->getSingleSuccessor() == ElseBB &&
           ElseBB->phis().empty()) ||
          (ElseEmpty && ElseBB->getSingleSuccessor() == ThenBB &&
           ThenBB->phis().empty())) {
        VecValuesToIgnore.insert(Ptr: Br);
        DeadOps.push_back(Elt: Br->getCondition());
      }
      continue;
    }

    // Skip any op that shouldn't be considered dead.
    if (!Op || !TheLoop->contains(Inst: Op) ||
        (isa<PHINode>(Val: Op) && Op->getParent() == Header) ||
        !wouldInstructionBeTriviallyDead(I: Op, TLI) ||
        any_of(Range: Op->users(), P: [this, IsLiveOutDead](User *U) {
          return !VecValuesToIgnore.contains(Ptr: U) &&
                 !ValuesToIgnore.contains(Ptr: U) && !IsLiveOutDead(U);
        }))
      continue;

    // If all of Op's users are in ValuesToIgnore, add it to ValuesToIgnore
    // which applies for both scalar and vector versions. Otherwise it is only
    // dead in vector versions, so only add it to VecValuesToIgnore.
    if (all_of(Range: Op->users(),
               P: [this](User *U) { return ValuesToIgnore.contains(Ptr: U); }))
      ValuesToIgnore.insert(Ptr: Op);

    VecValuesToIgnore.insert(Ptr: Op);
    DeadOps.append(in_start: Op->op_begin(), in_end: Op->op_end());
  }

  // Ignore type-promoting instructions we identified during reduction
  // detection.
  for (const auto &Reduction : Legal->getReductionVars()) {
    const RecurrenceDescriptor &RedDes = Reduction.second;
    const SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
  // Ignore type-casting instructions we identified during induction
  // detection.
  for (const auto &Induction : Legal->getInductionVars()) {
    const InductionDescriptor &IndDes = Induction.second;
    const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
    VecValuesToIgnore.insert_range(R: Casts);
  }
}
6525
6526void LoopVectorizationCostModel::collectInLoopReductions() {
6527 // Avoid duplicating work finding in-loop reductions.
6528 if (!InLoopReductions.empty())
6529 return;
6530
6531 for (const auto &Reduction : Legal->getReductionVars()) {
6532 PHINode *Phi = Reduction.first;
6533 const RecurrenceDescriptor &RdxDesc = Reduction.second;
6534
6535 // We don't collect reductions that are type promoted (yet).
6536 if (RdxDesc.getRecurrenceType() != Phi->getType())
6537 continue;
6538
6539 // If the target would prefer this reduction to happen "in-loop", then we
6540 // want to record it as such.
6541 RecurKind Kind = RdxDesc.getRecurrenceKind();
6542 if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
6543 !TTI.preferInLoopReduction(Kind, Ty: Phi->getType()))
6544 continue;
6545
6546 // Check that we can correctly put the reductions into the loop, by
6547 // finding the chain of operations that leads from the phi to the loop
6548 // exit value.
6549 SmallVector<Instruction *, 4> ReductionOperations =
6550 RdxDesc.getReductionOpChain(Phi, L: TheLoop);
6551 bool InLoop = !ReductionOperations.empty();
6552
6553 if (InLoop) {
6554 InLoopReductions.insert(Ptr: Phi);
6555 // Add the elements to InLoopReductionImmediateChains for cost modelling.
6556 Instruction *LastChain = Phi;
6557 for (auto *I : ReductionOperations) {
6558 InLoopReductionImmediateChains[I] = LastChain;
6559 LastChain = I;
6560 }
6561 }
6562 LLVM_DEBUG(dbgs() << "LV: Using " << (InLoop ? "inloop" : "out of loop")
6563 << " reduction for phi: " << *Phi << "\n");
6564 }
6565}
6566
6567// This function will select a scalable VF if the target supports scalable
6568// vectors and a fixed one otherwise.
6569// TODO: we could return a pair of values that specify the max VF and
6570// min VF, to be used in `buildVPlans(MinVF, MaxVF)` instead of
6571// `buildVPlans(VF, VF)`. We cannot do it because VPLAN at the moment
6572// doesn't have a cost model that can choose which plan to execute if
6573// more than one is generated.
6574static ElementCount determineVPlanVF(const TargetTransformInfo &TTI,
6575 LoopVectorizationCostModel &CM) {
6576 unsigned WidestType;
6577 std::tie(args: std::ignore, args&: WidestType) = CM.getSmallestAndWidestTypes();
6578
6579 TargetTransformInfo::RegisterKind RegKind =
6580 TTI.enableScalableVectorization()
6581 ? TargetTransformInfo::RGK_ScalableVector
6582 : TargetTransformInfo::RGK_FixedWidthVector;
6583
6584 TypeSize RegSize = TTI.getRegisterBitWidth(K: RegKind);
6585 unsigned N = RegSize.getKnownMinValue() / WidestType;
6586 return ElementCount::get(MinVal: N, Scalable: RegSize.isScalable());
6587}
6588
// Plan vectorization in the VPlan-native path. Only outer (non-innermost)
// loops are handled: a VF is chosen (user-provided or computed), VPlans are
// built, and the chosen factor is returned. Inner loops, unsupported scalable
// requests, and stress-test runs return VectorizationFactor::Disabled().
VectorizationFactor
LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
  ElementCount VF = UserVF;
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  if (!OrigLoop->isInnermost()) {
    // If the user doesn't provide a vectorization factor, determine a
    // reasonable one.
    if (UserVF.isZero()) {
      VF = determineVPlanVF(TTI, CM);
      LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n");

      // Make sure we have a VF > 1 for stress testing.
      if (VPlanBuildStressTest && (VF.isScalar() || VF.isZero())) {
        LLVM_DEBUG(dbgs() << "LV: VPlan stress testing: "
                          << "overriding computed VF.\n");
        VF = ElementCount::getFixed(MinVal: 4);
      }
    } else if (UserVF.isScalable() && !TTI.supportsScalableVectors() &&
               !ForceTargetSupportsScalableVectors) {
      // A scalable user VF cannot be honoured; report and give up.
      LLVM_DEBUG(dbgs() << "LV: Not vectorizing. Scalable VF requested, but "
                        << "not supported by the target.\n");
      reportVectorizationFailure(
          DebugMsg: "Scalable vectorization requested but not supported by the target",
          OREMsg: "the scalable user-specified vectorization width for outer-loop "
          "vectorization cannot be used because the target does not support "
          "scalable vectors.",
          ORETag: "ScalableVFUnfeasible", ORE, TheLoop: OrigLoop);
      return VectorizationFactor::Disabled();
    }
    assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");
    assert(isPowerOf2_32(VF.getKnownMinValue()) &&
           "VF needs to be a power of two");
    LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "")
                      << "VF " << VF << " to build VPlans.\n");
    buildVPlans(MinVF: VF, MaxVF: VF);

    // Construction produced no plans; nothing can be vectorized.
    if (VPlans.empty())
      return VectorizationFactor::Disabled();

    // For VPlan build stress testing, we bail out after VPlan construction.
    if (VPlanBuildStressTest)
      return VectorizationFactor::Disabled();

    return {VF, 0 /*Cost*/, 0 /* ScalarCost */};
  }

  LLVM_DEBUG(
      dbgs() << "LV: Not vectorizing. Inner loops aren't supported in the "
                "VPlan-native path.\n");
  return VectorizationFactor::Disabled();
}
6643
/// Build VPlans and collect cost-model decisions for all candidate
/// vectorization factors of the innermost loop. A valid user-provided VF
/// short-circuits candidate enumeration; otherwise all powers of two up to the
/// computed fixed and scalable maxima are considered.
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
  assert(OrigLoop->isInnermost() && "Inner loop expected.");
  CM.collectValuesToIgnore();
  CM.collectElementTypesForWidening();

  FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC);
  if (!MaxFactors) // Cases that should not be vectorized nor interleaved.
    return;

  // Invalidate interleave groups if all blocks of loop will be predicated.
  if (CM.blockNeedsPredicationForAnyReason(BB: OrigLoop->getHeader()) &&
      !useMaskedInterleavedAccesses(TTI)) {
    LLVM_DEBUG(
        dbgs()
        << "LV: Invalidate all interleaved groups due to fold-tail by masking "
           "which requires masked-interleaved support.\n");
    if (CM.InterleaveInfo.invalidateGroups())
      // Invalidating interleave groups also requires invalidating all decisions
      // based on them, which includes widening decisions and uniform and scalar
      // values.
      CM.invalidateCostModelingDecisions();
  }

  if (CM.foldTailByMasking())
    Legal->prepareToFoldTailByMasking();

  // A user VF is only validated against the maximum factor of the matching
  // kind (fixed vs. scalable).
  ElementCount MaxUserVF =
      UserVF.isScalable() ? MaxFactors.ScalableVF : MaxFactors.FixedVF;
  if (UserVF) {
    if (!ElementCount::isKnownLE(LHS: UserVF, RHS: MaxUserVF)) {
      reportVectorizationInfo(
          Msg: "UserVF ignored because it may be larger than the maximal safe VF",
          ORETag: "InvalidUserVF", ORE, TheLoop: OrigLoop);
    } else {
      assert(isPowerOf2_32(UserVF.getKnownMinValue()) &&
             "VF needs to be a power of two");
      // Collect the instructions (and their associated costs) that will be more
      // profitable to scalarize.
      CM.collectInLoopReductions();
      if (CM.selectUserVectorizationFactor(UserVF)) {
        LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
        buildVPlansWithVPRecipes(MinVF: UserVF, MaxVF: UserVF);
        LLVM_DEBUG(printPlans(dbgs()));
        return;
      }
      // The user VF had invalid costs; fall through to automatic selection.
      reportVectorizationInfo(Msg: "UserVF ignored because of invalid costs.",
                              ORETag: "InvalidCost", ORE, TheLoop: OrigLoop);
    }
  }

  // Collect the Vectorization Factor Candidates.
  SmallVector<ElementCount> VFCandidates;
  for (auto VF = ElementCount::getFixed(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.FixedVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);
  for (auto VF = ElementCount::getScalable(MinVal: 1);
       ElementCount::isKnownLE(LHS: VF, RHS: MaxFactors.ScalableVF); VF *= 2)
    VFCandidates.push_back(Elt: VF);

  CM.collectInLoopReductions();
  for (const auto &VF : VFCandidates) {
    // Collect Uniform and Scalar instructions after vectorization with VF.
    CM.collectNonVectorizedAndSetWideningDecisions(VF);
  }

  buildVPlansWithVPRecipes(MinVF: ElementCount::getFixed(MinVal: 1), MaxVF: MaxFactors.FixedVF);
  buildVPlansWithVPRecipes(MinVF: ElementCount::getScalable(MinVal: 1), MaxVF: MaxFactors.ScalableVF);

  LLVM_DEBUG(printPlans(dbgs()));
}
6714
6715InstructionCost VPCostContext::getLegacyCost(Instruction *UI,
6716 ElementCount VF) const {
6717 if (ForceTargetInstructionCost.getNumOccurrences())
6718 return InstructionCost(ForceTargetInstructionCost.getNumOccurrences());
6719 return CM.getInstructionCost(I: UI, VF);
6720}
6721
6722bool VPCostContext::isLegacyUniformAfterVectorization(Instruction *I,
6723 ElementCount VF) const {
6724 return CM.isUniformAfterVectorization(I, VF);
6725}
6726
6727bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
6728 return CM.ValuesToIgnore.contains(Ptr: UI) ||
6729 (IsVector && CM.VecValuesToIgnore.contains(Ptr: UI)) ||
6730 SkipCostComputation.contains(Ptr: UI);
6731}
6732
/// Pre-compute, using the legacy cost model, the costs of instructions whose
/// VPlan-based costing would diverge from the legacy model (induction chains,
/// exit conditions, non-latch branches, forced/profitable scalars). Each
/// costed instruction is added to \p CostCtx.SkipCostComputation so the
/// VPlan-based pass does not count it again. Returns the accumulated cost.
InstructionCost
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
                                          VPCostContext &CostCtx) const {
  InstructionCost Cost;
  // Cost modeling for inductions is inaccurate in the legacy cost model
  // compared to the recipes that are generated. To match here initially during
  // VPlan cost model bring up directly use the induction costs from the legacy
  // cost model. Note that we do this as pre-processing; the VPlan may not have
  // any recipes associated with the original induction increment instruction
  // and may replace truncates with VPWidenIntOrFpInductionRecipe. We precompute
  // the cost of induction phis and increments (both that are represented by
  // recipes and those that are not), to avoid distinguishing between them here,
  // and skip all recipes that represent induction phis and increments (the
  // former case) later on, if they exist, to avoid counting them twice.
  // Similarly we pre-compute the cost of any optimized truncates.
  // TODO: Switch to more accurate costing based on VPlan.
  for (const auto &[IV, IndDesc] : Legal->getInductionVars()) {
    Instruction *IVInc = cast<Instruction>(
        Val: IV->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
    // Worklist walk: IVInsts grows while iterating, transitively collecting
    // the single-use, in-loop operand chain feeding the increment.
    SmallVector<Instruction *> IVInsts = {IVInc};
    for (unsigned I = 0; I != IVInsts.size(); I++) {
      for (Value *Op : IVInsts[I]->operands()) {
        auto *OpI = dyn_cast<Instruction>(Val: Op);
        if (Op == IV || !OpI || !OrigLoop->contains(Inst: OpI) || !Op->hasOneUse())
          continue;
        IVInsts.push_back(Elt: OpI);
      }
    }
    IVInsts.push_back(Elt: IV);
    // Also include truncates of the IV that the cost model can optimize away
    // (they are absorbed into the widened induction).
    for (User *U : IV->users()) {
      auto *CI = cast<Instruction>(Val: U);
      if (!CostCtx.CM.isOptimizableIVTruncate(I: CI, VF))
        continue;
      IVInsts.push_back(Elt: CI);
    }

    // If the vector loop gets executed exactly once with the given VF, ignore
    // the costs of comparison and induction instructions, as they'll get
    // simplified away.
    // TODO: Remove this code after stepping away from the legacy cost model and
    // adding code to simplify VPlans before calculating their costs.
    auto TC = getSmallConstantTripCount(SE: PSE.getSE(), L: OrigLoop);
    if (TC == VF && !CM.foldTailByMasking())
      addFullyUnrolledInstructionsToIgnore(L: OrigLoop, IL: Legal->getInductionVars(),
                                           InstsToIgnore&: CostCtx.SkipCostComputation);

    for (Instruction *IVInst : IVInsts) {
      // Skip anything already ignored or already pre-computed.
      if (CostCtx.skipCostComputation(UI: IVInst, IsVector: VF.isVector()))
        continue;
      InstructionCost InductionCost = CostCtx.getLegacyCost(UI: IVInst, VF);
      LLVM_DEBUG({
        dbgs() << "Cost of " << InductionCost << " for VF " << VF
               << ": induction instruction " << *IVInst << "\n";
      });
      Cost += InductionCost;
      CostCtx.SkipCostComputation.insert(Ptr: IVInst);
    }
  }

  /// Compute the cost of all exiting conditions of the loop using the legacy
  /// cost model. This is to match the legacy behavior, which adds the cost of
  /// all exit conditions. Note that this over-estimates the cost, as there will
  /// be a single condition to control the vector loop.
  SmallVector<BasicBlock *> Exiting;
  CM.TheLoop->getExitingBlocks(ExitingBlocks&: Exiting);
  SetVector<Instruction *> ExitInstrs;
  // Collect all exit conditions.
  for (BasicBlock *EB : Exiting) {
    auto *Term = dyn_cast<BranchInst>(Val: EB->getTerminator());
    if (!Term || CostCtx.skipCostComputation(UI: Term, IsVector: VF.isVector()))
      continue;
    if (auto *CondI = dyn_cast<Instruction>(Val: Term->getOperand(i_nocapture: 0))) {
      ExitInstrs.insert(X: CondI);
    }
  }
  // Compute the cost of all instructions only feeding the exit conditions.
  // ExitInstrs grows while iterating, walking operand chains transitively.
  for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
    Instruction *CondI = ExitInstrs[I];
    if (!OrigLoop->contains(Inst: CondI) ||
        !CostCtx.SkipCostComputation.insert(Ptr: CondI).second)
      continue;
    InstructionCost CondICost = CostCtx.getLegacyCost(UI: CondI, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << CondICost << " for VF " << VF
             << ": exit condition instruction " << *CondI << "\n";
    });
    Cost += CondICost;
    for (Value *Op : CondI->operands()) {
      auto *OpI = dyn_cast<Instruction>(Val: Op);
      // Only pull in operands whose in-loop users are all exit-condition
      // instructions; anything else is costed as part of the loop body.
      if (!OpI || CostCtx.skipCostComputation(UI: OpI, IsVector: VF.isVector()) ||
          any_of(Range: OpI->users(), P: [&ExitInstrs, this](User *U) {
            return OrigLoop->contains(BB: cast<Instruction>(Val: U)->getParent()) &&
                   !ExitInstrs.contains(key: cast<Instruction>(Val: U));
          }))
        continue;
      ExitInstrs.insert(X: OpI);
    }
  }

  // Pre-compute the costs for branches except for the backedge, as the number
  // of replicate regions in a VPlan may not directly match the number of
  // branches, which would lead to different decisions.
  // TODO: Compute cost of branches for each replicate region in the VPlan,
  // which is more accurate than the legacy cost model.
  for (BasicBlock *BB : OrigLoop->blocks()) {
    if (CostCtx.skipCostComputation(UI: BB->getTerminator(), IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: BB->getTerminator());
    // The latch terminator (backedge) is marked skipped but contributes no
    // cost here.
    if (BB == OrigLoop->getLoopLatch())
      continue;
    auto BranchCost = CostCtx.getLegacyCost(UI: BB->getTerminator(), VF);
    Cost += BranchCost;
  }

  // Pre-compute costs for instructions that are forced-scalar or profitable to
  // scalarize. Their costs will be computed separately in the legacy cost
  // model.
  for (Instruction *ForcedScalar : CM.ForcedScalars[VF]) {
    if (CostCtx.skipCostComputation(UI: ForcedScalar, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: ForcedScalar);
    InstructionCost ForcedCost = CostCtx.getLegacyCost(UI: ForcedScalar, VF);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ForcedCost << " for VF " << VF
             << ": forced scalar " << *ForcedScalar << "\n";
    });
    Cost += ForcedCost;
  }
  // Scalarization costs were already computed and cached by the legacy cost
  // model; just account for them and mark the instructions as handled.
  for (const auto &[Scalarized, ScalarCost] : CM.InstsToScalarize[VF]) {
    if (CostCtx.skipCostComputation(UI: Scalarized, IsVector: VF.isVector()))
      continue;
    CostCtx.SkipCostComputation.insert(Ptr: Scalarized);
    LLVM_DEBUG({
      dbgs() << "Cost of " << ScalarCost << " for VF " << VF
             << ": profitable to scalarize " << *Scalarized << "\n";
    });
    Cost += ScalarCost;
  }

  return Cost;
}
6874
/// Return the total cost of \p Plan at factor \p VF: legacy pre-computed
/// costs (which also populate the skip set) plus the VPlan-based cost of the
/// remaining recipes.
InstructionCost LoopVectorizationPlanner::cost(VPlan &Plan,
                                               ElementCount VF) const {
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  // Legacy-model costs for instructions the VPlan-based model would mis-cost;
  // fills CostCtx.SkipCostComputation so they are not counted twice below.
  InstructionCost Cost = precomputeCosts(Plan, VF, CostCtx);

  // Now compute and add the VPlan-based cost.
  Cost += Plan.cost(VF, Ctx&: CostCtx);
#ifndef NDEBUG
  // Debug-only reporting of the estimated per-lane cost.
  unsigned EstimatedWidth = getEstimatedRuntimeVF(VF, CM.getVScaleForTuning());
  LLVM_DEBUG(dbgs() << "Cost for VF " << VF << ": " << Cost
                    << " (Estimated cost per lane: ");
  if (Cost.isValid()) {
    double CostPerLane = double(Cost.getValue()) / EstimatedWidth;
    LLVM_DEBUG(dbgs() << format("%.1f", CostPerLane));
  } else /* No point dividing an invalid cost - it will still be invalid */
    LLVM_DEBUG(dbgs() << "Invalid");
  LLVM_DEBUG(dbgs() << ")\n");
#endif
  return Cost;
}
6896
6897#ifndef NDEBUG
/// Return true if the original loop \p TheLoop contains any instructions that
/// do not have corresponding recipes in \p Plan and are not marked to be
/// ignored in \p CostCtx. This means the VPlan contains simplifications that
/// the legacy cost-model did not account for.
static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                                  VPCostContext &CostCtx,
                                                  Loop *TheLoop,
                                                  ElementCount VF) {
  // First collect all instructions for the recipes in Plan.
  auto GetInstructionForCost = [](const VPRecipeBase *R) -> Instruction * {
    if (auto *S = dyn_cast<VPSingleDefRecipe>(R))
      return dyn_cast_or_null<Instruction>(S->getUnderlyingValue());
    if (auto *WidenMem = dyn_cast<VPWidenMemoryRecipe>(R))
      return &WidenMem->getIngredient();
    return nullptr;
  };

  DenseSet<Instruction *> SeenInstrs;
  auto Iter = vp_depth_first_deep(Plan.getVectorLoopRegion()->getEntry());
  for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Iter)) {
    for (VPRecipeBase &R : *VPBB) {
      // An interleave recipe covers every member of its interleave group, so
      // record all members as seen.
      if (auto *IR = dyn_cast<VPInterleaveRecipe>(&R)) {
        auto *IG = IR->getInterleaveGroup();
        unsigned NumMembers = IG->getNumMembers();
        for (unsigned I = 0; I != NumMembers; ++I) {
          if (Instruction *M = IG->getMember(I))
            SeenInstrs.insert(M);
        }
        continue;
      }
      // Unused FOR splices are removed by VPlan transforms, so the VPlan-based
      // cost model won't cost it whilst the legacy will.
      if (auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(&R)) {
        if (none_of(FOR->users(), [](VPUser *U) {
              auto *VPI = dyn_cast<VPInstruction>(U);
              return VPI && VPI->getOpcode() ==
                                VPInstruction::FirstOrderRecurrenceSplice;
            }))
          return true;
      }
      // The VPlan-based cost model is more accurate for partial reduction and
      // comparing against the legacy cost isn't desirable.
      if (isa<VPPartialReductionRecipe>(&R))
        return true;

      /// If a VPlan transform folded a recipe to one producing a single-scalar,
      /// but the original instruction wasn't uniform-after-vectorization in the
      /// legacy cost model, the legacy cost overestimates the actual cost.
      if (auto *RepR = dyn_cast<VPReplicateRecipe>(&R)) {
        if (RepR->isSingleScalar() &&
            !CostCtx.isLegacyUniformAfterVectorization(
                RepR->getUnderlyingInstr(), VF))
          return true;
      }
      if (Instruction *UI = GetInstructionForCost(&R)) {
        // If we adjusted the predicate of the recipe, the cost in the legacy
        // cost model may be different.
        if (auto *WidenCmp = dyn_cast<VPWidenRecipe>(&R)) {
          if ((WidenCmp->getOpcode() == Instruction::ICmp ||
               WidenCmp->getOpcode() == Instruction::FCmp) &&
              WidenCmp->getPredicate() != cast<CmpInst>(UI)->getPredicate())
            return true;
        }
        SeenInstrs.insert(UI);
      }
    }
  }

  // Return true if the loop contains any instructions that are not also part of
  // the VPlan or are skipped for VPlan-based cost computations. This indicates
  // that the VPlan contains extra simplifications.
  return any_of(TheLoop->blocks(), [&SeenInstrs, &CostCtx,
                                    TheLoop](BasicBlock *BB) {
    return any_of(*BB, [&SeenInstrs, &CostCtx, TheLoop, BB](Instruction &I) {
      // Skip induction phis when checking for simplifications, as they may not
      // be lowered directly to a corresponding PHI recipe.
      if (isa<PHINode>(&I) && BB == TheLoop->getHeader() &&
          CostCtx.CM.Legal->isInductionPhi(cast<PHINode>(&I)))
        return false;
      return !SeenInstrs.contains(&I) && !CostCtx.skipCostComputation(&I, true);
    });
  });
}
6981#endif
6982
/// Select the most profitable vectorization factor among all built VPlans
/// using the VPlan-based cost model. In assert builds the decision is
/// cross-checked against the legacy cost model.
VectorizationFactor LoopVectorizationPlanner::computeBestVF() {
  if (VPlans.empty())
    return VectorizationFactor::Disabled();
  // If there is a single VPlan with a single VF, return it directly.
  VPlan &FirstPlan = *VPlans[0];
  if (VPlans.size() == 1 && size(Range: FirstPlan.vectorFactors()) == 1)
    return {*FirstPlan.vectorFactors().begin(), 0, 0};

  LLVM_DEBUG(dbgs() << "LV: Computing best VF using cost kind: "
                    << (CM.CostKind == TTI::TCK_RecipThroughput
                            ? "Reciprocal Throughput\n"
                            : CM.CostKind == TTI::TCK_Latency
                                  ? "Instruction Latency\n"
                                  : CM.CostKind == TTI::TCK_CodeSize ? "Code Size\n"
                                  : CM.CostKind == TTI::TCK_SizeAndLatency
                                      ? "Code Size and Latency\n"
                                      : "Unknown\n"));

  // A scalar plan always exists and provides the baseline the vector factors
  // compete against.
  ElementCount ScalarVF = ElementCount::getFixed(MinVal: 1);
  assert(hasPlanWithVF(ScalarVF) &&
         "More than a single plan/VF w/o any plan having scalar VF");

  // TODO: Compute scalar cost using VPlan-based cost model.
  InstructionCost ScalarCost = CM.expectedCost(VF: ScalarVF);
  LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ScalarCost << ".\n");
  VectorizationFactor ScalarFactor(ScalarVF, ScalarCost, ScalarCost);
  VectorizationFactor BestFactor = ScalarFactor;

  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
  if (ForceVectorization) {
    // Ignore scalar width, because the user explicitly wants vectorization.
    // Initialize cost to max so that VF = 2 is, at least, chosen during cost
    // evaluation.
    BestFactor.Cost = InstructionCost::getMax();
  }

  for (auto &P : VPlans) {
    ArrayRef<ElementCount> VFs(P->vectorFactors().begin(),
                               P->vectorFactors().end());

    // Register usage is only needed when max-bandwidth may select a VF that
    // spills; compute it once per plan, for all of its VFs.
    SmallVector<VPRegisterUsage, 8> RUs;
    if (CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_ScalableVector) ||
        CM.useMaxBandwidth(RegKind: TargetTransformInfo::RGK_FixedWidthVector))
      RUs = calculateRegisterUsageForPlan(Plan&: *P, VFs, TTI, ValuesToIgnore: CM.ValuesToIgnore);

    for (unsigned I = 0; I < VFs.size(); I++) {
      ElementCount VF = VFs[I];
      // The scalar VF is covered by ScalarFactor above.
      if (VF.isScalar())
        continue;
      if (!ForceVectorization && !willGenerateVectors(Plan&: *P, VF, TTI)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it will not generate any vector instructions.\n");
        continue;
      }
      if (CM.OptForSize && !ForceVectorization && hasReplicatorRegion(Plan&: *P)) {
        LLVM_DEBUG(
            dbgs()
            << "LV: Not considering vector loop of width " << VF
            << " because it would cause replicated blocks to be generated,"
            << " which isn't allowed when optimizing for size.\n");
        continue;
      }

      InstructionCost Cost = cost(Plan&: *P, VF);
      VectorizationFactor CurrentFactor(VF, Cost, ScalarCost);

      // Reject VFs whose register pressure exceeds the target's budget when
      // max-bandwidth selection is active.
      if (CM.useMaxBandwidth(VF) && RUs[I].exceedsMaxNumRegs(TTI)) {
        LLVM_DEBUG(dbgs() << "LV(REG): Not considering vector loop of width "
                          << VF << " because it uses too many registers\n");
        continue;
      }

      if (isMoreProfitable(A: CurrentFactor, B: BestFactor, HasTail: P->hasScalarTail()))
        BestFactor = CurrentFactor;

      // If profitable add it to ProfitableVF list.
      if (isMoreProfitable(A: CurrentFactor, B: ScalarFactor, HasTail: P->hasScalarTail()))
        ProfitableVFs.push_back(Elt: CurrentFactor);
    }
  }

#ifndef NDEBUG
  // Select the optimal vectorization factor according to the legacy cost-model.
  // This is now only used to verify the decisions by the new VPlan-based
  // cost-model and will be retired once the VPlan-based cost-model is
  // stabilized.
  VectorizationFactor LegacyVF = selectVectorizationFactor();
  VPlan &BestPlan = getPlanFor(BestFactor.Width);

  // Pre-compute the cost and use it to check if BestPlan contains any
  // simplifications not accounted for in the legacy cost model. If that's the
  // case, don't trigger the assertion, as the extra simplifications may cause a
  // different VF to be picked by the VPlan-based cost model.
  VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
                        CM.CostKind);
  precomputeCosts(BestPlan, BestFactor.Width, CostCtx);
  // Verify that the VPlan-based and legacy cost models agree, except for VPlans
  // with early exits and plans with additional VPlan simplifications. The
  // legacy cost model doesn't properly model costs for such loops.
  assert((BestFactor.Width == LegacyVF.Width || BestPlan.hasEarlyExit() ||
          planContainsAdditionalSimplifications(getPlanFor(BestFactor.Width),
                                                CostCtx, OrigLoop,
                                                BestFactor.Width) ||
          planContainsAdditionalSimplifications(
              getPlanFor(LegacyVF.Width), CostCtx, OrigLoop, LegacyVF.Width)) &&
         " VPlan cost model and legacy cost model disagreed");
  assert((BestFactor.Width.isScalar() || BestFactor.ScalarCost > 0) &&
         "when vectorizing, the scalar cost must be computed.");
#endif

  LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << BestFactor.Width << ".\n");
  return BestFactor;
}
7098
7099static void addRuntimeUnrollDisableMetaData(Loop *L) {
7100 SmallVector<Metadata *, 4> MDs;
7101 // Reserve first location for self reference to the LoopID metadata node.
7102 MDs.push_back(Elt: nullptr);
7103 bool IsUnrollMetadata = false;
7104 MDNode *LoopID = L->getLoopID();
7105 if (LoopID) {
7106 // First find existing loop unrolling disable metadata.
7107 for (unsigned I = 1, IE = LoopID->getNumOperands(); I < IE; ++I) {
7108 auto *MD = dyn_cast<MDNode>(Val: LoopID->getOperand(I));
7109 if (MD) {
7110 const auto *S = dyn_cast<MDString>(Val: MD->getOperand(I: 0));
7111 IsUnrollMetadata =
7112 S && S->getString().starts_with(Prefix: "llvm.loop.unroll.disable");
7113 }
7114 MDs.push_back(Elt: LoopID->getOperand(I));
7115 }
7116 }
7117
7118 if (!IsUnrollMetadata) {
7119 // Add runtime unroll disable metadata.
7120 LLVMContext &Context = L->getHeader()->getContext();
7121 SmallVector<Metadata *, 1> DisableOperands;
7122 DisableOperands.push_back(
7123 Elt: MDString::get(Context, Str: "llvm.loop.unroll.runtime.disable"));
7124 MDNode *DisableNode = MDNode::get(Context, MDs: DisableOperands);
7125 MDs.push_back(Elt: DisableNode);
7126 MDNode *NewLoopID = MDNode::get(Context, MDs);
7127 // Set operand 0 to refer to the loop id itself.
7128 NewLoopID->replaceOperandWith(I: 0, New: NewLoopID);
7129 L->setLoopID(NewLoopID);
7130 }
7131}
7132
/// Return the live-in IR start value of the FindIV reduction computed by
/// \p RdxResult, looking through a wrapping Freeze if one is present.
static Value *getStartValueFromReductionResult(VPInstruction *RdxResult) {
  using namespace VPlanPatternMatch;
  assert(RdxResult->getOpcode() == VPInstruction::ComputeFindIVResult &&
         "RdxResult must be ComputeFindIVResult");
  VPValue *StartVPV = RdxResult->getOperand(N: 1);
  // On a successful match, StartVPV is rebound to the Freeze's operand;
  // otherwise it is left unchanged.
  match(V: StartVPV, P: m_Freeze(Op0: m_VPValue(V&: StartVPV)));
  return StartVPV->getLiveInIRValue();
}
7141
// If \p EpiResumePhiR is a resume VPPhi for a reduction when vectorizing the
// epilog loop, fix the reduction's scalar PHI node by adding the incoming
// value from the main vector loop (reaching via \p BypassBlock).
static void fixReductionScalarResumeWhenVectorizingEpilog(
    VPPhi *EpiResumePhiR, VPTransformState &State, BasicBlock *BypassBlock) {
  // Get the VPInstruction computing the reduction result in the middle block.
  // The first operand may not be from the middle block if it is not connected
  // to the scalar preheader. In that case, there's nothing to fix.
  VPValue *Incoming = EpiResumePhiR->getOperand(N: 0);
  // Look through a possible zext/sext wrapping the reduction result.
  match(V: Incoming, P: VPlanPatternMatch::m_ZExtOrSExt(
                       Op0: VPlanPatternMatch::m_VPValue(V&: Incoming)));
  auto *EpiRedResult = dyn_cast<VPInstruction>(Val: Incoming);
  if (!EpiRedResult ||
      (EpiRedResult->getOpcode() != VPInstruction::ComputeAnyOfResult &&
       EpiRedResult->getOpcode() != VPInstruction::ComputeReductionResult &&
       EpiRedResult->getOpcode() != VPInstruction::ComputeFindIVResult))
    return;

  auto *EpiRedHeaderPhi =
      cast<VPReductionPHIRecipe>(Val: EpiRedResult->getOperand(N: 0));
  RecurKind Kind = EpiRedHeaderPhi->getRecurrenceKind();
  // Recover the resume value produced by the main vector loop; the start value
  // may be wrapped in a Broadcast/ReductionStartVector recipe.
  Value *MainResumeValue;
  if (auto *VPI = dyn_cast<VPInstruction>(Val: EpiRedHeaderPhi->getStartValue())) {
    assert((VPI->getOpcode() == VPInstruction::Broadcast ||
            VPI->getOpcode() == VPInstruction::ReductionStartVector) &&
           "unexpected start recipe");
    MainResumeValue = VPI->getOperand(N: 0)->getUnderlyingValue();
  } else
    MainResumeValue = EpiRedHeaderPhi->getStartValue()->getUnderlyingValue();
  if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind)) {
    // AnyOf reductions start from an ICMP_NE against the original start value;
    // peel the compare off to get the underlying resume phi.
    [[maybe_unused]] Value *StartV =
        EpiRedResult->getOperand(N: 1)->getLiveInIRValue();
    auto *Cmp = cast<ICmpInst>(Val: MainResumeValue);
    assert(Cmp->getPredicate() == CmpInst::ICMP_NE &&
           "AnyOf expected to start with ICMP_NE");
    assert(Cmp->getOperand(1) == StartV &&
           "AnyOf expected to start by comparing main resume value to original "
           "start value");
    MainResumeValue = Cmp->getOperand(i_nocapture: 0);
  } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind)) {
    // FindIV reductions resume through a select between the sentinel and the
    // original resume value; unwrap the select to recover the resume phi.
    Value *StartV = getStartValueFromReductionResult(RdxResult: EpiRedResult);
    Value *SentinelV = EpiRedResult->getOperand(N: 2)->getLiveInIRValue();
    using namespace llvm::PatternMatch;
    Value *Cmp, *OrigResumeV, *CmpOp;
    [[maybe_unused]] bool IsExpectedPattern =
        match(V: MainResumeValue,
              P: m_Select(C: m_OneUse(SubPattern: m_Value(V&: Cmp)), L: m_Specific(V: SentinelV),
                        R: m_Value(V&: OrigResumeV))) &&
        (match(V: Cmp, P: m_SpecificICmp(MatchPred: ICmpInst::ICMP_EQ, L: m_Specific(V: OrigResumeV),
                                    R: m_Value(V&: CmpOp))) &&
         ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(V: CmpOp))));
    assert(IsExpectedPattern && "Unexpected reduction resume pattern");
    MainResumeValue = OrigResumeV;
  }
  PHINode *MainResumePhi = cast<PHINode>(Val: MainResumeValue);

  // When fixing reductions in the epilogue loop we should already have
  // created a bc.merge.rdx Phi after the main vector body. Ensure that we carry
  // over the incoming values correctly.
  auto *EpiResumePhi = cast<PHINode>(Val: State.get(Def: EpiResumePhiR, IsScalar: true));
  EpiResumePhi->setIncomingValueForBlock(
      BB: BypassBlock, V: MainResumePhi->getIncomingValueForBlock(BB: BypassBlock));
}
7205
7206DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
7207 ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7208 InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
7209 assert(BestVPlan.hasVF(BestVF) &&
7210 "Trying to execute plan with unsupported VF");
7211 assert(BestVPlan.hasUF(BestUF) &&
7212 "Trying to execute plan with unsupported UF");
7213 if (BestVPlan.hasEarlyExit())
7214 ++LoopsEarlyExitVectorized;
7215 // TODO: Move to VPlan transform stage once the transition to the VPlan-based
7216 // cost model is complete for better cost estimates.
7217 VPlanTransforms::runPass(Fn: VPlanTransforms::unrollByUF, Plan&: BestVPlan, Args&: BestUF,
7218 Args&: OrigLoop->getHeader()->getContext());
7219 VPlanTransforms::runPass(Fn: VPlanTransforms::replicateByVF, Plan&: BestVPlan, Args&: BestVF);
7220 VPlanTransforms::runPass(Fn: VPlanTransforms::materializeBroadcasts, Plan&: BestVPlan);
7221 bool HasBranchWeights =
7222 hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator());
7223 if (HasBranchWeights) {
7224 std::optional<unsigned> VScale = CM.getVScaleForTuning();
7225 VPlanTransforms::runPass(Fn: VPlanTransforms::addBranchWeightToMiddleTerminator,
7226 Plan&: BestVPlan, Args&: BestVF, Args&: VScale);
7227 }
7228
7229 if (!VectorizingEpilogue) {
7230 // Checks are the same for all VPlans, added to BestVPlan only for
7231 // compactness.
7232 attachRuntimeChecks(Plan&: BestVPlan, RTChecks&: ILV.RTChecks, HasBranchWeights);
7233 }
7234
7235 // Retrieving VectorPH now when it's easier while VPlan still has Regions.
7236 VPBasicBlock *VectorPH = cast<VPBasicBlock>(Val: BestVPlan.getVectorPreheader());
7237 VPlanTransforms::optimizeForVFAndUF(Plan&: BestVPlan, BestVF, BestUF, PSE);
7238 VPlanTransforms::simplifyRecipes(Plan&: BestVPlan, CanonicalIVTy&: *Legal->getWidestInductionType());
7239 VPlanTransforms::narrowInterleaveGroups(
7240 Plan&: BestVPlan, VF: BestVF,
7241 VectorRegWidth: TTI.getRegisterBitWidth(K: TargetTransformInfo::RGK_FixedWidthVector));
7242 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7243
7244 VPlanTransforms::convertToConcreteRecipes(Plan&: BestVPlan,
7245 CanonicalIVTy&: *Legal->getWidestInductionType());
7246 // Regions are dissolved after optimizing for VF and UF, which completely
7247 // removes unneeded loop regions first.
7248 VPlanTransforms::dissolveLoopRegions(Plan&: BestVPlan);
7249 // Perform the actual loop transformation.
7250 VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan,
7251 OrigLoop->getParentLoop(),
7252 Legal->getWidestInductionType());
7253
7254#ifdef EXPENSIVE_CHECKS
7255 assert(DT->verify(DominatorTree::VerificationLevel::Fast));
7256#endif
7257
7258 // 0. Generate SCEV-dependent code in the entry, including TripCount, before
7259 // making any changes to the CFG.
7260 DenseMap<const SCEV *, Value *> ExpandedSCEVs;
7261 auto *Entry = cast<VPIRBasicBlock>(Val: BestVPlan.getEntry());
7262 State.Builder.SetInsertPoint(Entry->getIRBasicBlock()->getTerminator());
7263 for (VPRecipeBase &R : make_early_inc_range(Range&: *Entry)) {
7264 auto *ExpSCEV = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
7265 if (!ExpSCEV)
7266 continue;
7267 ExpSCEV->execute(State);
7268 ExpandedSCEVs[ExpSCEV->getSCEV()] = State.get(Def: ExpSCEV, Lane: VPLane(0));
7269 VPValue *Exp = BestVPlan.getOrAddLiveIn(V: ExpandedSCEVs[ExpSCEV->getSCEV()]);
7270 ExpSCEV->replaceAllUsesWith(New: Exp);
7271 if (BestVPlan.getTripCount() == ExpSCEV)
7272 BestVPlan.resetTripCount(NewTripCount: Exp);
7273 ExpSCEV->eraseFromParent();
7274 }
7275
7276 if (!ILV.getTripCount())
7277 ILV.setTripCount(State.get(Def: BestVPlan.getTripCount(), Lane: VPLane(0)));
7278 else
7279 assert(VectorizingEpilogue && "should only re-use the existing trip "
7280 "count during epilogue vectorization");
7281
7282 // 1. Set up the skeleton for vectorization, including vector pre-header and
7283 // middle block. The vector loop is created during VPlan execution.
7284 BasicBlock *EntryBB =
7285 cast<VPIRBasicBlock>(Val: BestVPlan.getEntry())->getIRBasicBlock();
7286 State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
7287 if (VectorizingEpilogue)
7288 VPlanTransforms::removeDeadRecipes(Plan&: BestVPlan);
7289
7290 assert(verifyVPlanIsValid(BestVPlan, true /*VerifyLate*/) &&
7291 "final VPlan is invalid");
7292
7293 ILV.printDebugTracesAtStart();
7294
7295 //===------------------------------------------------===//
7296 //
7297 // Notice: any optimization or new instruction that go
7298 // into the code below should also be implemented in
7299 // the cost-model.
7300 //
7301 //===------------------------------------------------===//
7302
7303 // 2. Copy and widen instructions from the old loop into the new loop.
7304 BestVPlan.prepareToExecute(
7305 TripCount: ILV.getTripCount(),
7306 VectorTripCount: ILV.getOrCreateVectorTripCount(InsertBlock: ILV.LoopVectorPreHeader), State);
7307 replaceVPBBWithIRVPBB(VPBB: VectorPH, IRBB: State.CFG.PrevBB);
7308
7309 // Move check blocks to their final position.
7310 // TODO: Move as part of VPIRBB execute and update impacted tests.
7311 if (BasicBlock *MemCheckBlock = ILV.RTChecks.getMemRuntimeChecks().second)
7312 MemCheckBlock->moveAfter(MovePos: EntryBB);
7313 if (BasicBlock *SCEVCheckBlock = ILV.RTChecks.getSCEVChecks().second)
7314 SCEVCheckBlock->moveAfter(MovePos: EntryBB);
7315
7316 BestVPlan.execute(State: &State);
7317
7318 // 2.5 When vectorizing the epilogue, fix reduction resume values from the
7319 // additional bypass block.
7320 if (VectorizingEpilogue) {
7321 assert(!BestVPlan.hasEarlyExit() &&
7322 "Epilogue vectorisation not yet supported with early exits");
7323 BasicBlock *PH = OrigLoop->getLoopPreheader();
7324 BasicBlock *BypassBlock = ILV.getAdditionalBypassBlock();
7325 for (auto *Pred : predecessors(BB: PH)) {
7326 for (PHINode &Phi : PH->phis()) {
7327 if (Phi.getBasicBlockIndex(BB: Pred) != -1)
7328 continue;
7329 Phi.addIncoming(V: Phi.getIncomingValueForBlock(BB: BypassBlock), BB: Pred);
7330 }
7331 }
7332 VPBasicBlock *ScalarPH = BestVPlan.getScalarPreheader();
7333 if (ScalarPH->getNumPredecessors() > 0) {
7334 // If ScalarPH has predecessors, we may need to update its reduction
7335 // resume values.
7336 for (VPRecipeBase &R : ScalarPH->phis()) {
7337 fixReductionScalarResumeWhenVectorizingEpilog(EpiResumePhiR: cast<VPPhi>(Val: &R), State,
7338 BypassBlock);
7339 }
7340 }
7341 }
7342
7343 // 2.6. Maintain Loop Hints
7344 // Keep all loop hints from the original loop on the vector loop (we'll
7345 // replace the vectorizer-specific hints below).
7346 VPBasicBlock *HeaderVPBB = vputils::getFirstLoopHeader(Plan&: BestVPlan, VPDT&: State.VPDT);
7347 if (HeaderVPBB) {
7348 MDNode *OrigLoopID = OrigLoop->getLoopID();
7349
7350 std::optional<MDNode *> VectorizedLoopID =
7351 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
7352 LLVMLoopVectorizeFollowupVectorized});
7353
7354 Loop *L = LI->getLoopFor(BB: State.CFG.VPBB2IRBB[HeaderVPBB]);
7355 if (VectorizedLoopID) {
7356 L->setLoopID(*VectorizedLoopID);
7357 } else {
7358 // Keep all loop hints from the original loop on the vector loop (we'll
7359 // replace the vectorizer-specific hints below).
7360 if (MDNode *LID = OrigLoop->getLoopID())
7361 L->setLoopID(LID);
7362
7363 LoopVectorizeHints Hints(L, true, *ORE);
7364 Hints.setAlreadyVectorized();
7365
7366 // Check if it's EVL-vectorized and mark the corresponding metadata.
7367 bool IsEVLVectorized =
7368 llvm::any_of(Range&: *HeaderVPBB, P: [](const VPRecipeBase &Recipe) {
7369 // Looking for the ExplictVectorLength VPInstruction.
7370 if (const auto *VI = dyn_cast<VPInstruction>(Val: &Recipe))
7371 return VI->getOpcode() == VPInstruction::ExplicitVectorLength;
7372 return false;
7373 });
7374 if (IsEVLVectorized) {
7375 LLVMContext &Context = L->getHeader()->getContext();
7376 MDNode *LoopID = L->getLoopID();
7377 auto *IsEVLVectorizedMD = MDNode::get(
7378 Context,
7379 MDs: {MDString::get(Context, Str: "llvm.loop.isvectorized.tailfoldingstyle"),
7380 MDString::get(Context, Str: "evl")});
7381 MDNode *NewLoopID = makePostTransformationMetadata(Context, OrigLoopID: LoopID, RemovePrefixes: {},
7382 AddAttrs: {IsEVLVectorizedMD});
7383 L->setLoopID(NewLoopID);
7384 }
7385 }
7386 TargetTransformInfo::UnrollingPreferences UP;
7387 TTI.getUnrollingPreferences(L, *PSE.getSE(), UP, ORE);
7388 if (!UP.UnrollVectorizedLoop || VectorizingEpilogue)
7389 addRuntimeUnrollDisableMetaData(L);
7390 }
7391
7392 // 3. Fix the vectorized code: take care of header phi's, live-outs,
7393 // predication, updating analyses.
7394 ILV.fixVectorizedLoop(State);
7395
7396 ILV.printDebugTracesAtEnd();
7397
7398 return ExpandedSCEVs;
7399}
7400
7401//===--------------------------------------------------------------------===//
7402// EpilogueVectorizerMainLoop
7403//===--------------------------------------------------------------------===//
7404
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// It creates the skeleton for the *main* vector loop of an epilogue
/// vectorization, emitting both iteration-count checks and saving state in
/// EPI for the second (epilogue) pass. Returns the vector loop preheader.
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
  createVectorLoopSkeleton(Prefix: "");

  // Generate the code to check the minimum iteration count of the vector
  // epilogue (see below).
  EPI.EpilogueIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: true);
  EPI.EpilogueIterationCountCheck->setName("iter.check");

  // Generate the iteration count check for the main loop, *after* the check
  // for the epilogue loop, so that the path-length is shorter for the case
  // that goes directly through the vector epilogue. The longer-path length for
  // the main loop is compensated for, by the gain from vectorizing the larger
  // trip count. Note: the branch will get updated later on when we vectorize
  // the epilogue.
  EPI.MainLoopIterationCountCheck =
      emitIterationCountCheck(Bypass: LoopScalarPreHeader, ForEpilogue: false);

  // Generate the induction variable.
  EPI.VectorTripCount = getOrCreateVectorTripCount(InsertBlock: LoopVectorPreHeader);

  // Hook the scalar preheader into the VPlan by replacing the placeholder
  // scalar-preheader VPBB with an IR-backed block.
  replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
  return LoopVectorPreHeader;
}
7431
7432void EpilogueVectorizerMainLoop::printDebugTracesAtStart() {
7433 LLVM_DEBUG({
7434 dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n"
7435 << "Main Loop VF:" << EPI.MainLoopVF
7436 << ", Main Loop UF:" << EPI.MainLoopUF
7437 << ", Epilogue Loop VF:" << EPI.EpilogueVF
7438 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7439 });
7440}
7441
7442void EpilogueVectorizerMainLoop::printDebugTracesAtEnd() {
7443 DEBUG_WITH_TYPE(VerboseDebug, {
7444 dbgs() << "intermediate fn:\n"
7445 << *OrigLoop->getHeader()->getParent() << "\n";
7446 });
7447}
7448
/// Emit a minimum-iteration-count check that branches to \p Bypass when too
/// few iterations remain; otherwise control falls through to a newly split
/// vector preheader. When \p ForEpilogue is set, the epilogue loop's VF/UF
/// are checked (and the trip count is saved in EPI for the second pass);
/// otherwise the main loop's VF/UF are used. Returns the block containing
/// the check.
BasicBlock *
EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
                                                    bool ForEpilogue) {
  assert(Bypass && "Expected valid bypass basic block.");
  Value *Count = getTripCount();
  // NOTE(review): the minimum profitable trip count is cleared here so that
  // only the VF * UF minimum-iterations check below applies — confirm intent.
  MinProfitableTripCount = ElementCount::getFixed(MinVal: 0);
  Value *CheckMinIters = createIterationCountCheck(
      VF: ForEpilogue ? EPI.EpilogueVF : VF, UF: ForEpilogue ? EPI.EpilogueUF : UF);

  BasicBlock *const TCCheckBlock = LoopVectorPreHeader;
  if (!ForEpilogue)
    TCCheckBlock->setName("vector.main.loop.iter.check");

  // Create new preheader for vector loop.
  LoopVectorPreHeader = SplitBlock(Old: TCCheckBlock, SplitPt: TCCheckBlock->getTerminator(),
                                   DT: static_cast<DominatorTree *>(nullptr), LI,
                                   MSSAU: nullptr, BBName: "vector.ph");

  if (ForEpilogue) {
    // Save the trip count so we don't have to regenerate it in the
    // vec.epilog.iter.check. This is safe to do because the trip count
    // generated here dominates the vector epilog iter check.
    EPI.TripCount = Count;
  }

  // Replace the split block's unconditional terminator with the conditional
  // bypass branch.
  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator()))
    setBranchWeights(I&: BI, Weights: MinItersBypassWeights, /*IsExpected=*/false);
  ReplaceInstWithInst(From: TCCheckBlock->getTerminator(), To: &BI);

  // When vectorizing the main loop, its trip-count check is placed in a new
  // block, whereas the overall trip-count check is placed in the VPlan entry
  // block. When vectorizing the epilogue loop, its trip-count check is placed
  // in the VPlan entry block.
  if (!ForEpilogue)
    introduceCheckBlockInVPlan(CheckIRBB: TCCheckBlock);
  return TCCheckBlock;
}
7488
7489//===--------------------------------------------------------------------===//
7490// EpilogueVectorizerEpilogueLoop
7491//===--------------------------------------------------------------------===//
7492
/// This function is partially responsible for generating the control flow
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
/// Second pass: creates the skeleton for the epilogue vector loop, re-wiring
/// the check blocks created by the first (main-loop) pass and migrating phis
/// out of the new vec.epilog.iter.check block. Returns the epilogue vector
/// loop preheader.
BasicBlock *
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
  createVectorLoopSkeleton(Prefix: "vec.epilog.");

  // Now, compare the remaining count and if there aren't enough iterations to
  // execute the vectorized epilogue skip to the scalar part.
  LoopVectorPreHeader->setName("vec.epilog.ph");
  BasicBlock *VecEpilogueIterationCountCheck =
      SplitBlock(Old: LoopVectorPreHeader, SplitPt: LoopVectorPreHeader->begin(), DT, LI,
                 MSSAU: nullptr, BBName: "vec.epilog.iter.check", Before: true);
  emitMinimumVectorEpilogueIterCountCheck(Bypass: LoopScalarPreHeader,
                                          Insert: VecEpilogueIterationCountCheck);
  // Record the check block: resume values for the scalar loop can flow in
  // through it when the epilogue's iteration-count check fails.
  AdditionalBypassBlock = VecEpilogueIterationCountCheck;

  // Adjust the control flow taking the state info from the main loop
  // vectorization into account.
  assert(EPI.MainLoopIterationCountCheck && EPI.EpilogueIterationCountCheck &&
         "expected this to be saved from the previous pass.");
  // Retarget edges from the main-loop check that point at the epilogue check
  // block so they go straight to the epilogue vector preheader instead.
  EPI.MainLoopIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopVectorPreHeader);

  // Retarget edges from the first-pass epilogue check so they bypass directly
  // to the scalar preheader.
  EPI.EpilogueIterationCountCheck->getTerminator()->replaceUsesOfWith(
      From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  // Adjust the terminators of runtime check blocks and phis using them.
  BasicBlock *SCEVCheckBlock = RTChecks.getSCEVChecks().second;
  BasicBlock *MemCheckBlock = RTChecks.getMemRuntimeChecks().second;
  if (SCEVCheckBlock)
    SCEVCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);
  if (MemCheckBlock)
    MemCheckBlock->getTerminator()->replaceUsesOfWith(
        From: VecEpilogueIterationCountCheck, To: LoopScalarPreHeader);

  DT->changeImmediateDominator(BB: LoopScalarPreHeader,
                               NewBB: EPI.EpilogueIterationCountCheck);

  // The vec.epilog.iter.check block may contain Phi nodes from inductions or
  // reductions which merge control-flow from the latch block and the middle
  // block. Update the incoming values here and move the Phi into the preheader.
  // Snapshot the phi list first since the loop below moves phis out of the
  // block while iterating.
  SmallVector<PHINode *, 4> PhisInBlock(
      llvm::make_pointer_range(Range: VecEpilogueIterationCountCheck->phis()));

  for (PHINode *Phi : PhisInBlock) {
    Phi->moveBefore(InsertPos: LoopVectorPreHeader->getFirstNonPHIIt());
    Phi->replaceIncomingBlockWith(
        Old: VecEpilogueIterationCountCheck->getSinglePredecessor(),
        New: VecEpilogueIterationCountCheck);

    // If the phi doesn't have an incoming value from the
    // EpilogueIterationCountCheck, we are done. Otherwise remove the incoming
    // value and also those from other check blocks. This is needed for
    // reduction phis only.
    if (none_of(Range: Phi->blocks(), P: [&](BasicBlock *IncB) {
          return EPI.EpilogueIterationCountCheck == IncB;
        }))
      continue;
    Phi->removeIncomingValue(BB: EPI.EpilogueIterationCountCheck);
    if (SCEVCheckBlock)
      Phi->removeIncomingValue(BB: SCEVCheckBlock);
    if (MemCheckBlock)
      Phi->removeIncomingValue(BB: MemCheckBlock);
  }

  replaceVPBBWithIRVPBB(VPBB: Plan.getScalarPreheader(), IRBB: LoopScalarPreHeader);
  return LoopVectorPreHeader;
}
7562
/// Emit into \p Insert the compare-and-branch guarding the vectorized
/// epilogue: the iterations remaining after the main vector loop
/// (TripCount - VectorTripCount) are compared against the epilogue's
/// VF * UF, branching to \p Bypass when there are too few. Also re-hooks the
/// epilogue VPlan's entry to \p Insert. Returns \p Insert.
BasicBlock *
EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
    BasicBlock *Bypass, BasicBlock *Insert) {

  assert(EPI.TripCount &&
         "Expected trip count to have been saved in the first pass.");
  Value *TC = EPI.TripCount;
  IRBuilder<> Builder(Insert->getTerminator());
  Value *Count = Builder.CreateSub(LHS: TC, RHS: EPI.VectorTripCount, Name: "n.vec.remaining");

  // Generate code to check if the loop's trip count is less than VF * UF of the
  // vector epilogue loop.
  // When a scalar epilogue is required, even an exact multiple of the step
  // must take the bypass, hence ULE instead of ULT.
  auto P = Cost->requiresScalarEpilogue(IsVectorizing: EPI.EpilogueVF.isVector())
               ? ICmpInst::ICMP_ULE
               : ICmpInst::ICMP_ULT;

  Value *CheckMinIters =
      Builder.CreateICmp(P, LHS: Count,
                         RHS: createStepForVF(B&: Builder, Ty: Count->getType(),
                                             VF: EPI.EpilogueVF, Step: EPI.EpilogueUF),
                         Name: "min.epilog.iters.check");

  BranchInst &BI =
      *BranchInst::Create(IfTrue: Bypass, IfFalse: LoopVectorPreHeader, Cond: CheckMinIters);
  if (hasBranchWeightMD(I: *OrigLoop->getLoopLatch()->getTerminator())) {
    // FIXME: See test Transforms/LoopVectorize/branch-weights.ll. I don't
    // think the MainLoopStep is correct.
    unsigned MainLoopStep = UF * VF.getKnownMinValue();
    unsigned EpilogueLoopStep =
        EPI.EpilogueUF * EPI.EpilogueVF.getKnownMinValue();
    // We assume the remaining `Count` is equally distributed in
    // [0, MainLoopStep)
    // So the probability for `Count < EpilogueLoopStep` should be
    // min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
    unsigned EstimatedSkipCount = std::min(a: MainLoopStep, b: EpilogueLoopStep);
    const uint32_t Weights[] = {EstimatedSkipCount,
                                MainLoopStep - EstimatedSkipCount};
    setBranchWeights(I&: BI, Weights, /*IsExpected=*/false);
  }
  ReplaceInstWithInst(From: Insert->getTerminator(), To: &BI);

  // A new entry block has been created for the epilogue VPlan. Hook it in, as
  // otherwise we would try to modify the entry to the main vector loop.
  VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(IRBB: Insert);
  VPBasicBlock *OldEntry = Plan.getEntry();
  VPBlockUtils::reassociateBlocks(Old: OldEntry, New: NewEntry);
  Plan.setEntry(NewEntry);
  // OldEntry is now dead and will be cleaned up when the plan gets destroyed.

  return Insert;
}
7614
7615void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() {
7616 LLVM_DEBUG({
7617 dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n"
7618 << "Epilogue Loop VF:" << EPI.EpilogueVF
7619 << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n";
7620 });
7621}
7622
7623void EpilogueVectorizerEpilogueLoop::printDebugTracesAtEnd() {
7624 DEBUG_WITH_TYPE(VerboseDebug, {
7625 dbgs() << "final fn:\n" << *OrigLoop->getHeader()->getParent() << "\n";
7626 });
7627}
7628
/// Try to build a VPWidenLoadRecipe/VPWidenStoreRecipe for the load or store
/// \p I, clamping \p Range to the VFs for which the cost model decided to
/// widen (or interleave) the access. Returns nullptr if the access will not
/// be widened for any VF in the clamped range.
VPWidenMemoryRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
                                  VFRange &Range) {
  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
         "Must be called with either a load or store");

  // True iff the cost model's decision for VF is to widen or interleave
  // (i.e. not scalarize) the access.
  auto WillWiden = [&](ElementCount VF) -> bool {
    LoopVectorizationCostModel::InstWidening Decision =
        CM.getWideningDecision(I, VF);
    assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
           "CM decision should be taken at this point.");
    if (Decision == LoopVectorizationCostModel::CM_Interleave)
      return true;
    if (CM.isScalarAfterVectorization(I, VF) ||
        CM.isProfitableToScalarize(I, VF))
      return false;
    return Decision != LoopVectorizationCostModel::CM_Scalarize;
  };

  if (!LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillWiden, Range))
    return nullptr;

  VPValue *Mask = nullptr;
  if (Legal->isMaskRequired(I))
    Mask = getBlockInMask(VPBB: Builder.getInsertBlock());

  // Determine if the pointer operand of the access is either consecutive or
  // reverse consecutive. The decision for Range.Start holds for the whole
  // clamped range per the predicate above.
  LoopVectorizationCostModel::InstWidening Decision =
      CM.getWideningDecision(I, VF: Range.Start);
  bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse;
  bool Consecutive =
      Reverse || Decision == LoopVectorizationCostModel::CM_Widen;

  // For loads the pointer is operand 0; for stores the value is operand 0 and
  // the pointer is operand 1.
  VPValue *Ptr = isa<LoadInst>(Val: I) ? Operands[0] : Operands[1];
  if (Consecutive) {
    auto *GEP = dyn_cast<GetElementPtrInst>(
        Val: Ptr->getUnderlyingValue()->stripPointerCasts());
    VPSingleDefRecipe *VectorPtr;
    if (Reverse) {
      // When folding the tail, we may compute an address that we don't in the
      // original scalar loop and it may not be inbounds. Drop Inbounds in that
      // case.
      GEPNoWrapFlags Flags =
          (CM.foldTailByMasking() || !GEP || !GEP->isInBounds())
              ? GEPNoWrapFlags::none()
              : GEPNoWrapFlags::inBounds();
      // Stride -1 models the reverse-consecutive access.
      VectorPtr =
          new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
                                       /*Stride*/ -1, Flags, I->getDebugLoc());
    } else {
      VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
                                            GEP ? GEP->getNoWrapFlags()
                                                : GEPNoWrapFlags::none(),
                                            I->getDebugLoc());
    }
    Builder.insert(R: VectorPtr);
    Ptr = VectorPtr;
  }
  if (LoadInst *Load = dyn_cast<LoadInst>(Val: I))
    return new VPWidenLoadRecipe(*Load, Ptr, Mask, Consecutive, Reverse,
                                 VPIRMetadata(*Load, LVer), I->getDebugLoc());

  StoreInst *Store = cast<StoreInst>(Val: I);
  return new VPWidenStoreRecipe(*Store, Ptr, Operands[0], Mask, Consecutive,
                                Reverse, VPIRMetadata(*Store, LVer),
                                I->getDebugLoc());
}
7697
/// Creates a VPWidenIntOrFpInductionRecipe for \p Phi. If needed, it will also
/// insert a recipe to expand the step for the induction recipe. \p PhiOrTrunc
/// is either \p Phi itself or a truncation of it, in which case the trunc is
/// attached to the recipe.
static VPWidenIntOrFpInductionRecipe *
createWidenInductionRecipes(PHINode *Phi, Instruction *PhiOrTrunc,
                            VPValue *Start, const InductionDescriptor &IndDesc,
                            VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop) {
  // The start value must match the phi's incoming value from the preheader,
  // and the step must be loop-invariant so it can be expanded outside the
  // loop.
  assert(IndDesc.getStartValue() ==
         Phi->getIncomingValueForBlock(OrigLoop.getLoopPreheader()));
  assert(SE.isLoopInvariant(IndDesc.getStep(), &OrigLoop) &&
         "step must be loop invariant");

  VPValue *Step =
      vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: IndDesc.getStep(), SE);
  if (auto *TruncI = dyn_cast<TruncInst>(Val: PhiOrTrunc)) {
    // Pass the trunc to the recipe and use its debug location.
    return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                             IndDesc, TruncI,
                                             TruncI->getDebugLoc());
  }
  assert(isa<PHINode>(PhiOrTrunc) && "must be a phi node here");
  return new VPWidenIntOrFpInductionRecipe(Phi, Start, Step, &Plan.getVF(),
                                           IndDesc, Phi->getDebugLoc());
}
7720
/// Try to build a widened induction recipe for the header phi \p Phi if it is
/// an integer, floating-point or pointer induction; returns nullptr otherwise.
VPHeaderPHIRecipe *VPRecipeBuilder::tryToOptimizeInductionPHI(
    PHINode *Phi, ArrayRef<VPValue *> Operands, VFRange &Range) {

  // Check if this is an integer or fp induction. If so, build the recipe that
  // produces its scalar and vector values.
  if (auto *II = Legal->getIntOrFpInductionDescriptor(Phi))
    return createWidenInductionRecipes(Phi, PhiOrTrunc: Phi, Start: Operands[0], IndDesc: *II, Plan,
                                       SE&: *PSE.getSE(), OrigLoop&: *OrigLoop);

  // Check if this is pointer induction. If so, build the recipe for it.
  if (auto *II = Legal->getPointerInductionDescriptor(Phi)) {
    VPValue *Step = vputils::getOrCreateVPValueForSCEVExpr(Plan, Expr: II->getStep(),
                                                           SE&: *PSE.getSE());
    // The boolean argument records whether the phi is scalar after
    // vectorization across the (clamped) VF range.
    return new VPWidenPointerInductionRecipe(
        Phi, Operands[0], Step, &Plan.getVFxUF(), *II,
        LoopVectorizationPlanner::getDecisionAndClampRange(
            Predicate: [&](ElementCount VF) {
              return CM.isScalarAfterVectorization(I: Phi, VF);
            },
            Range),
        Phi->getDebugLoc());
  }
  return nullptr;
}
7745
/// Try to turn the truncation \p I of an induction phi into a widened
/// induction recipe that produces the truncated values directly; returns
/// nullptr when the cost model says the trunc is not optimizable for any VF
/// in \p Range.
VPWidenIntOrFpInductionRecipe *VPRecipeBuilder::tryToOptimizeInductionTruncate(
    TruncInst *I, ArrayRef<VPValue *> Operands, VFRange &Range) {
  // Optimize the special case where the source is a constant integer
  // induction variable. Notice that we can only optimize the 'trunc' case
  // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
  // (c) other casts depend on pointer size.

  // Determine whether \p K is a truncation based on an induction variable that
  // can be optimized.
  auto IsOptimizableIVTruncate =
      [&](Instruction *K) -> std::function<bool(ElementCount)> {
    return [=](ElementCount VF) -> bool {
      return CM.isOptimizableIVTruncate(I: K, VF);
    };
  };

  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: IsOptimizableIVTruncate(I), Range)) {

    // The trunc's operand is the induction phi; widen the induction using the
    // trunc's type via PhiOrTrunc = I.
    auto *Phi = cast<PHINode>(Val: I->getOperand(i_nocapture: 0));
    const InductionDescriptor &II = *Legal->getIntOrFpInductionDescriptor(Phi);
    VPValue *Start = Plan.getOrAddLiveIn(V: II.getStartValue());
    return createWidenInductionRecipes(Phi, PhiOrTrunc: I, Start, IndDesc: II, Plan, SE&: *PSE.getSE(),
                                       OrigLoop&: *OrigLoop);
  }
  return nullptr;
}
7773
/// Try to widen the call \p CI either as a vector intrinsic or as a call to a
/// vector library variant, based on the cost model's call-widening decision
/// for the VFs in \p Range. Returns nullptr if the call must be predicated,
/// is a special intrinsic handled elsewhere, or has no profitable vector
/// form.
VPSingleDefRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI,
                                                   ArrayRef<VPValue *> Operands,
                                                   VFRange &Range) {
  // Calls that are scalar-with-predication are not widened here.
  bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [this, CI](ElementCount VF) {
        return CM.isScalarWithPredication(I: CI, VF);
      },
      Range);

  if (IsPredicated)
    return nullptr;

  Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
  // These intrinsics are handled specially rather than widened into vector
  // calls.
  if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
             ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect ||
             ID == Intrinsic::pseudoprobe ||
             ID == Intrinsic::experimental_noalias_scope_decl))
    return nullptr;

  // Keep only the call's arguments; trailing operands beyond arg_size are
  // re-added for vector calls below.
  SmallVector<VPValue *, 4> Ops(Operands.take_front(N: CI->arg_size()));

  // Is it beneficial to perform intrinsic call compared to lib call?
  bool ShouldUseVectorIntrinsic =
      ID && LoopVectorizationPlanner::getDecisionAndClampRange(
                Predicate: [&](ElementCount VF) -> bool {
                  return CM.getCallWideningDecision(CI, VF).Kind ==
                         LoopVectorizationCostModel::CM_IntrinsicCall;
                },
                Range);
  if (ShouldUseVectorIntrinsic)
    return new VPWidenIntrinsicRecipe(*CI, ID, Ops, CI->getType(),
                                      CI->getDebugLoc());

  Function *Variant = nullptr;
  std::optional<unsigned> MaskPos;
  // Is it better to call a vectorized version of the function than to
  // scalarize the call?
  auto ShouldUseVectorCall = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) -> bool {
        // The following case may be scalarized depending on the VF.
        // The flag shows whether we can use a usual Call for vectorized
        // version of the instruction.

        // If we've found a variant at a previous VF, then stop looking. A
        // vectorized variant of a function expects input in a certain shape
        // -- basically the number of input registers, the number of lanes
        // per register, and whether there's a mask required.
        // We store a pointer to the variant in the VPWidenCallRecipe, so
        // once we have an appropriate variant it's only valid for that VF.
        // This will force a different vplan to be generated for each VF that
        // finds a valid variant.
        if (Variant)
          return false;
        LoopVectorizationCostModel::CallWideningDecision Decision =
            CM.getCallWideningDecision(CI, VF);
        if (Decision.Kind == LoopVectorizationCostModel::CM_VectorCall) {
          Variant = Decision.Variant;
          MaskPos = Decision.MaskPos;
          return true;
        }

        return false;
      },
      Range);
  if (ShouldUseVectorCall) {
    if (MaskPos.has_value()) {
      // We have 2 cases that would require a mask:
      //   1) The block needs to be predicated, either due to a conditional
      //      in the scalar loop or use of an active lane mask with
      //      tail-folding, and we use the appropriate mask for the block.
      //   2) No mask is required for the block, but the only available
      //      vector variant at this VF requires a mask, so we synthesize an
      //      all-true mask.
      VPValue *Mask = nullptr;
      if (Legal->isMaskRequired(I: CI))
        Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
      else
        Mask = Plan.getOrAddLiveIn(
            V: ConstantInt::getTrue(Ty: IntegerType::getInt1Ty(C&: CI->getContext())));

      // Insert the mask at the position the variant expects it.
      Ops.insert(I: Ops.begin() + *MaskPos, Elt: Mask);
    }

    // Re-append the trailing operand (dropped by take_front above).
    Ops.push_back(Elt: Operands.back());
    return new VPWidenCallRecipe(CI, Variant, Ops, CI->getDebugLoc());
  }

  return nullptr;
}
7863
7864bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
7865 assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
7866 !isa<StoreInst>(I) && "Instruction should have been handled earlier");
7867 // Instruction should be widened, unless it is scalar after vectorization,
7868 // scalarization is profitable or it is predicated.
7869 auto WillScalarize = [this, I](ElementCount VF) -> bool {
7870 return CM.isScalarAfterVectorization(I, VF) ||
7871 CM.isProfitableToScalarize(I, VF) ||
7872 CM.isScalarWithPredication(I, VF);
7873 };
7874 return !LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: WillScalarize,
7875 Range);
7876}
7877
7878VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I,
7879 ArrayRef<VPValue *> Operands) {
7880 switch (I->getOpcode()) {
7881 default:
7882 return nullptr;
7883 case Instruction::SDiv:
7884 case Instruction::UDiv:
7885 case Instruction::SRem:
7886 case Instruction::URem: {
7887 // If not provably safe, use a select to form a safe divisor before widening the
7888 // div/rem operation itself. Otherwise fall through to general handling below.
7889 if (CM.isPredicatedInst(I)) {
7890 SmallVector<VPValue *> Ops(Operands);
7891 VPValue *Mask = getBlockInMask(VPBB: Builder.getInsertBlock());
7892 VPValue *One =
7893 Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I->getType(), V: 1u, IsSigned: false));
7894 auto *SafeRHS = Builder.createSelect(Cond: Mask, TrueVal: Ops[1], FalseVal: One, DL: I->getDebugLoc());
7895 Ops[1] = SafeRHS;
7896 return new VPWidenRecipe(*I, Ops);
7897 }
7898 [[fallthrough]];
7899 }
7900 case Instruction::Add:
7901 case Instruction::And:
7902 case Instruction::AShr:
7903 case Instruction::FAdd:
7904 case Instruction::FCmp:
7905 case Instruction::FDiv:
7906 case Instruction::FMul:
7907 case Instruction::FNeg:
7908 case Instruction::FRem:
7909 case Instruction::FSub:
7910 case Instruction::ICmp:
7911 case Instruction::LShr:
7912 case Instruction::Mul:
7913 case Instruction::Or:
7914 case Instruction::Select:
7915 case Instruction::Shl:
7916 case Instruction::Sub:
7917 case Instruction::Xor:
7918 case Instruction::Freeze: {
7919 SmallVector<VPValue *> NewOps(Operands);
7920 if (Instruction::isBinaryOp(Opcode: I->getOpcode())) {
7921 // The legacy cost model uses SCEV to check if some of the operands are
7922 // constants. To match the legacy cost model's behavior, use SCEV to try
7923 // to replace operands with constants.
7924 ScalarEvolution &SE = *PSE.getSE();
7925 auto GetConstantViaSCEV = [this, &SE](VPValue *Op) {
7926 if (!Op->isLiveIn())
7927 return Op;
7928 Value *V = Op->getUnderlyingValue();
7929 if (isa<Constant>(Val: V) || !SE.isSCEVable(Ty: V->getType()))
7930 return Op;
7931 auto *C = dyn_cast<SCEVConstant>(Val: SE.getSCEV(V));
7932 if (!C)
7933 return Op;
7934 return Plan.getOrAddLiveIn(V: C->getValue());
7935 };
7936 // For Mul, the legacy cost model checks both operands.
7937 if (I->getOpcode() == Instruction::Mul)
7938 NewOps[0] = GetConstantViaSCEV(NewOps[0]);
7939 // For other binops, the legacy cost model only checks the second operand.
7940 NewOps[1] = GetConstantViaSCEV(NewOps[1]);
7941 }
7942 return new VPWidenRecipe(*I, NewOps);
7943 }
7944 case Instruction::ExtractValue: {
7945 SmallVector<VPValue *> NewOps(Operands);
7946 Type *I32Ty = IntegerType::getInt32Ty(C&: I->getContext());
7947 auto *EVI = cast<ExtractValueInst>(Val: I);
7948 assert(EVI->getNumIndices() == 1 && "Expected one extractvalue index");
7949 unsigned Idx = EVI->getIndices()[0];
7950 NewOps.push_back(Elt: Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: Idx, IsSigned: false)));
7951 return new VPWidenRecipe(*I, NewOps);
7952 }
7953 };
7954}
7955
7956VPHistogramRecipe *
7957VPRecipeBuilder::tryToWidenHistogram(const HistogramInfo *HI,
7958 ArrayRef<VPValue *> Operands) {
7959 // FIXME: Support other operations.
7960 unsigned Opcode = HI->Update->getOpcode();
7961 assert((Opcode == Instruction::Add || Opcode == Instruction::Sub) &&
7962 "Histogram update operation must be an Add or Sub");
7963
7964 SmallVector<VPValue *, 3> HGramOps;
7965 // Bucket address.
7966 HGramOps.push_back(Elt: Operands[1]);
7967 // Increment value.
7968 HGramOps.push_back(Elt: getVPValueOrAddLiveIn(V: HI->Update->getOperand(i: 1)));
7969
7970 // In case of predicated execution (due to tail-folding, or conditional
7971 // execution, or both), pass the relevant mask.
7972 if (Legal->isMaskRequired(I: HI->Store))
7973 HGramOps.push_back(Elt: getBlockInMask(VPBB: Builder.getInsertBlock()));
7974
7975 return new VPHistogramRecipe(Opcode, HGramOps, HI->Store->getDebugLoc());
7976}
7977
/// Build a VPReplicateRecipe for \p I, marking it uniform when the cost model
/// says so across the clamped VF range (with special-case handling for some
/// intrinsics under scalable VFs) and attaching the block-in mask when \p I
/// must be predicated.
VPReplicateRecipe *
VPRecipeBuilder::handleReplication(Instruction *I, ArrayRef<VPValue *> Operands,
                                   VFRange &Range) {
  bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange(
      Predicate: [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); },
      Range);

  bool IsPredicated = CM.isPredicatedInst(I);

  // Even if the instruction is not marked as uniform, there are certain
  // intrinsic calls that can be effectively treated as such, so we check for
  // them here. Conservatively, we only do this for scalable vectors, since
  // for fixed-width VFs we can always fall back on full scalarization.
  if (!IsUniform && Range.Start.isScalable() && isa<IntrinsicInst>(Val: I)) {
    switch (cast<IntrinsicInst>(Val: I)->getIntrinsicID()) {
    case Intrinsic::assume:
    case Intrinsic::lifetime_start:
    case Intrinsic::lifetime_end:
      // For scalable vectors if one of the operands is variant then we still
      // want to mark as uniform, which will generate one instruction for just
      // the first lane of the vector. We can't scalarize the call in the same
      // way as for fixed-width vectors because we don't know how many lanes
      // there are.
      //
      // The reasons for doing it this way for scalable vectors are:
      //   1. For the assume intrinsic generating the instruction for the first
      //      lane is still better than not generating any at all. For
      //      example, the input may be a splat across all lanes.
      //   2. For the lifetime start/end intrinsics the pointer operand only
      //      does anything useful when the input comes from a stack object,
      //      which suggests it should always be uniform. For non-stack objects
      //      the effect is to poison the object, which still allows us to
      //      remove the call.
      IsUniform = true;
      break;
    default:
      break;
    }
  }
  VPValue *BlockInMask = nullptr;
  if (!IsPredicated) {
    // Finalize the recipe for Instr, first if it is not predicated.
    LLVM_DEBUG(dbgs() << "LV: Scalarizing:" << *I << "\n");
  } else {
    LLVM_DEBUG(dbgs() << "LV: Scalarizing and predicating:" << *I << "\n");
    // Instructions marked for predication are replicated and a mask operand is
    // added initially. Masked replicate recipes will later be placed under an
    // if-then construct to prevent side-effects. Generate recipes to compute
    // the block mask for this region.
    BlockInMask = getBlockInMask(VPBB: Builder.getInsertBlock());
  }

  // Note that there is some custom logic to mark some intrinsics as uniform
  // manually above for scalable vectors, which this assert needs to account for
  // as well.
  assert((Range.Start.isScalar() || !IsUniform || !IsPredicated ||
          (Range.Start.isScalable() && isa<IntrinsicInst>(I))) &&
         "Should not predicate a uniform recipe");
  auto *Recipe = new VPReplicateRecipe(I, Operands, IsUniform, BlockInMask,
                                       VPIRMetadata(*I, LVer));
  return Recipe;
}
8040
/// Find all possible partial reductions in the loop and track all of those
/// that are valid so recipes can be formed later. Valid chains are recorded
/// in ScaledReductionMap keyed by their reduction instruction.
void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
  // Find all possible partial reductions.
  SmallVector<std::pair<PartialReductionChain, unsigned>>
      PartialReductionChains;
  for (const auto &[Phi, RdxDesc] : Legal->getReductionVars()) {
    getScaledReductions(PHI: Phi, RdxExitInstr: RdxDesc.getLoopExitInstr(), Range,
                        Chains&: PartialReductionChains);
  }

  // A partial reduction is invalid if any of its extends are used by
  // something that isn't another partial reduction. This is because the
  // extends are intended to be lowered along with the reduction itself.

  // Build up a set of partial reduction ops for efficient use checking.
  SmallSet<User *, 4> PartialReductionOps;
  for (const auto &[PartialRdx, _] : PartialReductionChains)
    PartialReductionOps.insert(Ptr: PartialRdx.ExtendUser);

  auto ExtendIsOnlyUsedByPartialReductions =
      [&PartialReductionOps](Instruction *Extend) {
        return all_of(Range: Extend->users(), P: [&](const User *U) {
          return PartialReductionOps.contains(Ptr: U);
        });
      };

  // Check if each use of a chain's two extends is a partial reduction
  // and only add those that don't have non-partial reduction users.
  // ExtendB may be absent (single-extend chains), in which case only
  // ExtendA's users are checked.
  for (auto Pair : PartialReductionChains) {
    PartialReductionChain Chain = Pair.first;
    if (ExtendIsOnlyUsedByPartialReductions(Chain.ExtendA) &&
        (!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
      ScaledReductionMap.try_emplace(Key: Chain.Reduction, Args&: Pair.second);
  }
}
8077
/// Try to recognize a partial-reduction chain rooted at \p RdxExitInstr, the
/// in-loop update of reduction phi \p PHI. The recognized patterns are
/// "PHI += binop(ext(A), ext(B))" (possibly negated) and "PHI += ext(A)".
/// On success, appends the chain together with its scale factor — the known
/// integer ratio of the accumulator's bit width to the extend's input bit
/// width — to \p Chains and returns true. Recurses through the non-phi
/// operand so chained updates can each form their own partial reduction.
bool VPRecipeBuilder::getScaledReductions(
    Instruction *PHI, Instruction *RdxExitInstr, VFRange &Range,
    SmallVectorImpl<std::pair<PartialReductionChain, unsigned>> &Chains) {
  // The reduction update must be inside the loop being vectorized.
  if (!CM.TheLoop->contains(Inst: RdxExitInstr))
    return false;

  auto *Update = dyn_cast<BinaryOperator>(Val: RdxExitInstr);
  if (!Update)
    return false;

  // Normalize so that Op is the non-phi operand and PhiOp is the accumulator.
  Value *Op = Update->getOperand(i_nocapture: 0);
  Value *PhiOp = Update->getOperand(i_nocapture: 1);
  if (Op == PHI)
    std::swap(a&: Op, b&: PhiOp);

  // Try and get a scaled reduction from the first non-phi operand.
  // If one is found, we use the discovered reduction instruction in
  // place of the accumulator for costing.
  if (auto *OpInst = dyn_cast<Instruction>(Val: Op)) {
    if (getScaledReductions(PHI, RdxExitInstr: OpInst, Range, Chains)) {
      // The inner chain's reduction now acts as this update's accumulator;
      // re-derive Op/PhiOp against it.
      PHI = Chains.rbegin()->first.Reduction;

      Op = Update->getOperand(i_nocapture: 0);
      PhiOp = Update->getOperand(i_nocapture: 1);
      if (Op == PHI)
        std::swap(a&: Op, b&: PhiOp);
    }
  }
  // One operand of the update must be the accumulator phi (or the inner
  // chain's reduction found above).
  if (PhiOp != PHI)
    return false;

  using namespace llvm::PatternMatch;

  // If the update is a binary operator, check both of its operands to see if
  // they are extends. Otherwise, see if the update comes directly from an
  // extend.
  Instruction *Exts[2] = {nullptr};
  BinaryOperator *ExtendUser = dyn_cast<BinaryOperator>(Val: Op);
  std::optional<unsigned> BinOpc;
  Type *ExtOpTypes[2] = {nullptr};

  // Checks that each value in \p Ops is a zext/sext of an in-loop value and
  // records the extend instructions and their input types in Exts/ExtOpTypes.
  auto CollectExtInfo = [this, &Exts,
                         &ExtOpTypes](SmallVectorImpl<Value *> &Ops) -> bool {
    unsigned I = 0;
    for (Value *OpI : Ops) {
      Value *ExtOp;
      if (!match(V: OpI, P: m_ZExtOrSExt(Op: m_Value(V&: ExtOp))))
        return false;
      Exts[I] = cast<Instruction>(Val: OpI);

      // TODO: We should be able to support live-ins.
      if (!CM.TheLoop->contains(Inst: Exts[I]))
        return false;

      ExtOpTypes[I] = ExtOp->getType();
      I++;
    }
    return true;
  };

  if (ExtendUser) {
    // The extends must feed only this chain; multiple uses would force the
    // extends to also exist separately from the lowered partial reduction.
    if (!ExtendUser->hasOneUse())
      return false;

    // Use the side-effect of match to replace BinOp only if the pattern is
    // matched, we don't care at this point whether it actually matched.
    match(V: ExtendUser, P: m_Neg(V: m_BinOp(I&: ExtendUser)));

    SmallVector<Value *> Ops(ExtendUser->operands());
    if (!CollectExtInfo(Ops))
      return false;

    BinOpc = std::make_optional(t: ExtendUser->getOpcode());
  } else if (match(V: Update, P: m_Add(L: m_Value(), R: m_Value()))) {
    // Single-extend form: the add's non-phi operand is itself the extend.
    // We already know the operands for Update are Op and PhiOp.
    SmallVector<Value *> Ops({Op});
    if (!CollectExtInfo(Ops))
      return false;

    ExtendUser = Update;
    BinOpc = std::nullopt;
  } else
    return false;

  // Exts[1] stays null for the single-extend form; report PR_None for it.
  TTI::PartialReductionExtendKind OpAExtend =
      TTI::getPartialReductionExtendKind(I: Exts[0]);
  TTI::PartialReductionExtendKind OpBExtend =
      Exts[1] ? TTI::getPartialReductionExtendKind(I: Exts[1]) : TTI::PR_None;
  PartialReductionChain Chain(RdxExitInstr, Exts[0], Exts[1], ExtendUser);

  // The scale factor is the exact ratio of accumulator bits to extend-input
  // bits; bail out if that ratio is not a known integer.
  TypeSize PHISize = PHI->getType()->getPrimitiveSizeInBits();
  TypeSize ASize = ExtOpTypes[0]->getPrimitiveSizeInBits();
  if (!PHISize.hasKnownScalarFactor(RHS: ASize))
    return false;
  unsigned TargetScaleFactor = PHISize.getKnownScalarFactor(RHS: ASize);

  // Only accept the chain for VFs where the target reports a valid partial
  // reduction cost, clamping Range accordingly.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) {
            InstructionCost Cost = TTI->getPartialReductionCost(
                Opcode: Update->getOpcode(), InputTypeA: ExtOpTypes[0], InputTypeB: ExtOpTypes[1],
                AccumType: PHI->getType(), VF, OpAExtend, OpBExtend, BinOp: BinOpc, CostKind: CM.CostKind);
            return Cost.isValid();
          },
          Range)) {
    Chains.emplace_back(Args&: Chain, Args&: TargetScaleFactor);
    return true;
  }

  return false;
}
8188
/// Try to convert \p R, a VPlan0 recipe wrapping an original scalar
/// instruction, into a widened recipe valid for \p Range. Dispatches in
/// priority order to the specialized builders (induction phis, reduction/FOR
/// phis, truncates, calls, histograms, memory ops, partial reductions) and
/// falls back to generic widening. Returns nullptr if the instruction should
/// instead be replicated (handled by the caller).
VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
                                                      VFRange &Range) {
  // First, check for specific widening recipes that deal with inductions, Phi
  // nodes, calls and memory operations.
  VPRecipeBase *Recipe;
  Instruction *Instr = R->getUnderlyingInstr();
  SmallVector<VPValue *, 4> Operands(R->operands());
  if (auto *PhiR = dyn_cast<VPWidenPHIRecipe>(Val: R)) {
    // Only header phis reach this point; non-header phis were lowered during
    // predication.
    VPBasicBlock *Parent = PhiR->getParent();
    [[maybe_unused]] VPRegionBlock *LoopRegionOf =
        Parent->getEnclosingLoopRegion();
    assert(LoopRegionOf && LoopRegionOf->getEntry() == Parent &&
           "Non-header phis should have been handled during predication");
    auto *Phi = cast<PHINode>(Val: R->getUnderlyingInstr());
    assert(Operands.size() == 2 && "Must have 2 operands for header phis");
    if ((Recipe = tryToOptimizeInductionPHI(Phi, Operands, Range)))
      return Recipe;

    VPHeaderPHIRecipe *PhiRecipe = nullptr;
    assert((Legal->isReductionVariable(Phi) ||
            Legal->isFixedOrderRecurrence(Phi)) &&
           "can only widen reductions and fixed-order recurrences here");
    // Operands[0] is the start (preheader) value, Operands[1] the backedge
    // value.
    VPValue *StartV = Operands[0];
    if (Legal->isReductionVariable(PN: Phi)) {
      const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(PN: Phi);
      assert(RdxDesc.getRecurrenceStartValue() ==
             Phi->getIncomingValueForBlock(OrigLoop->getLoopPreheader()));

      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr()).value_or(u: 1);
      PhiRecipe = new VPReductionPHIRecipe(
          Phi, RdxDesc.getRecurrenceKind(), *StartV, CM.isInLoopReduction(Phi),
          CM.useOrderedReductions(RdxDesc), ScaleFactor);
    } else {
      // TODO: Currently fixed-order recurrences are modeled as chains of
      // first-order recurrences. If there are no users of the intermediate
      // recurrences in the chain, the fixed order recurrence should be modeled
      // directly, enabling more efficient codegen.
      PhiRecipe = new VPFirstOrderRecurrencePHIRecipe(Phi, *StartV);
    }
    // Add backedge value.
    PhiRecipe->addOperand(Operand: Operands[1]);
    return PhiRecipe;
  }

  if (isa<TruncInst>(Val: Instr) && (Recipe = tryToOptimizeInductionTruncate(
                                     I: cast<TruncInst>(Val: Instr), Operands, Range)))
    return Recipe;

  // All widen recipes below deal only with VF > 1.
  if (LoopVectorizationPlanner::getDecisionAndClampRange(
          Predicate: [&](ElementCount VF) { return VF.isScalar(); }, Range))
    return nullptr;

  if (auto *CI = dyn_cast<CallInst>(Val: Instr))
    return tryToWidenCall(CI, Operands, Range);

  // Stores recognized as histogram updates get a dedicated recipe.
  if (StoreInst *SI = dyn_cast<StoreInst>(Val: Instr))
    if (auto HistInfo = Legal->getHistogramInfo(I: SI))
      return tryToWidenHistogram(HI: *HistInfo, Operands);

  if (isa<LoadInst>(Val: Instr) || isa<StoreInst>(Val: Instr))
    return tryToWidenMemory(I: Instr, Operands, Range);

  // Reduction updates registered by collectScaledReductions become partial
  // reduction recipes.
  if (std::optional<unsigned> ScaleFactor = getScalingForReduction(ExitInst: Instr))
    return tryToCreatePartialReduction(Reduction: Instr, Operands, ScaleFactor: ScaleFactor.value());

  if (!shouldWiden(I: Instr, Range))
    return nullptr;

  if (auto *GEP = dyn_cast<GetElementPtrInst>(Val: Instr))
    return new VPWidenGEPRecipe(GEP, Operands);

  if (auto *SI = dyn_cast<SelectInst>(Val: Instr)) {
    return new VPWidenSelectRecipe(*SI, Operands);
  }

  if (auto *CI = dyn_cast<CastInst>(Val: Instr)) {
    return new VPWidenCastRecipe(CI->getOpcode(), Operands[0], CI->getType(),
                                 *CI);
  }

  // Generic fallback for remaining widenable instructions.
  return tryToWiden(I: Instr, Operands);
}
8274
/// Create a VPPartialReductionRecipe for \p Reduction, the in-loop update of a
/// reduction previously validated by collectScaledReductions. \p Operands are
/// the update's two operands (bin-op result and accumulator, in either order);
/// \p ScaleFactor is the accumulator-to-input element-count ratio.
VPRecipeBase *
VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
                                             ArrayRef<VPValue *> Operands,
                                             unsigned ScaleFactor) {
  assert(Operands.size() == 2 &&
         "Unexpected number of operands for partial reduction");

  VPValue *BinOp = Operands[0];
  VPValue *Accumulator = Operands[1];
  // Normalize operand order: the accumulator is the one defined by a
  // reduction phi or a preceding partial reduction in a chain.
  VPRecipeBase *BinOpRecipe = BinOp->getDefiningRecipe();
  if (isa<VPReductionPHIRecipe>(Val: BinOpRecipe) ||
      isa<VPPartialReductionRecipe>(Val: BinOpRecipe))
    std::swap(a&: BinOp, b&: Accumulator);

  unsigned ReductionOpcode = Reduction->getOpcode();
  if (ReductionOpcode == Instruction::Sub) {
    // Canonicalize "acc - x" into "acc + (0 - x)" so the partial reduction
    // recipe only needs to handle adds.
    auto *const Zero = ConstantInt::get(Ty: Reduction->getType(), V: 0);
    SmallVector<VPValue *, 2> Ops;
    Ops.push_back(Elt: Plan.getOrAddLiveIn(V: Zero));
    Ops.push_back(Elt: BinOp);
    BinOp = new VPWidenRecipe(*Reduction, Ops);
    Builder.insert(R: BinOp->getDefiningRecipe());
    ReductionOpcode = Instruction::Add;
  }

  VPValue *Cond = nullptr;
  if (CM.blockNeedsPredicationForAnyReason(BB: Reduction->getParent())) {
    assert((ReductionOpcode == Instruction::Add ||
            ReductionOpcode == Instruction::Sub) &&
           "Expected an ADD or SUB operation for predicated partial "
           "reductions (because the neutral element in the mask is zero)!");
    // Mask out inactive lanes by selecting zero (the add's neutral element).
    Cond = getBlockInMask(VPBB: Builder.getInsertBlock());
    VPValue *Zero =
        Plan.getOrAddLiveIn(V: ConstantInt::get(Ty: Reduction->getType(), V: 0));
    BinOp = Builder.createSelect(Cond, TrueVal: BinOp, FalseVal: Zero, DL: Reduction->getDebugLoc());
  }
  return new VPPartialReductionRecipe(ReductionOpcode, Accumulator, BinOp, Cond,
                                      ScaleFactor, Reduction);
}
8314
/// Build VPlans covering all VFs in [\p MinVF, \p MaxVF]. Each call to
/// tryToBuildVPlanWithVPRecipes consumes a sub-range of VFs (clamped via
/// VFRange), so the loop emits one plan per decision-compatible VF range.
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
                                                        ElementCount MaxVF) {
  if (ElementCount::isKnownGT(LHS: MinVF, RHS: MaxVF))
    return;

  assert(OrigLoop->isInnermost() && "Inner loop expected.");

  const LoopAccessInfo *LAI = Legal->getLAI();
  LoopVersioning LVer(*LAI, LAI->getRuntimePointerChecking()->getChecks(),
                      OrigLoop, LI, DT, PSE.getSE());
  if (!LAI->getRuntimePointerChecking()->getChecks().empty() &&
      !LAI->getRuntimePointerChecking()->getDiffChecks()) {
    // Only use noalias metadata when using memory checks guaranteeing no
    // overlap across all iterations.
    LVer.prepareNoAliasMetadata();
  }

  // VFRange ends are exclusive, so use MaxVF * 2 to include MaxVF itself.
  auto MaxVFTimes2 = MaxVF * 2;
  // Build the plain-CFG VPlan once and duplicate it per sub-range.
  auto VPlan0 = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI);
  for (ElementCount VF = MinVF; ElementCount::isKnownLT(LHS: VF, RHS: MaxVFTimes2);) {
    VFRange SubRange = {VF, MaxVFTimes2};
    if (auto Plan = tryToBuildVPlanWithVPRecipes(
            InitialPlan: std::unique_ptr<VPlan>(VPlan0->duplicate()), Range&: SubRange, LVer: &LVer)) {
      bool HasScalarVF = Plan->hasScalarVFOnly();
      // Now optimize the initial VPlan.
      if (!HasScalarVF)
        VPlanTransforms::runPass(Fn: VPlanTransforms::truncateToMinimalBitwidths,
                                 Plan&: *Plan, Args: CM.getMinimalBitwidths());
      VPlanTransforms::runPass(Fn: VPlanTransforms::optimize, Plan&: *Plan);
      // TODO: try to put it close to addActiveLaneMask().
      // Discard the plan if it is not EVL-compatible
      if (CM.foldTailWithEVL() && !HasScalarVF &&
          !VPlanTransforms::runPass(Transform: VPlanTransforms::tryAddExplicitVectorLength,
                                    Plan&: *Plan, Args: CM.getMaxSafeElements()))
        break;
      assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
      VPlans.push_back(Elt: std::move(Plan));
    }
    // Continue from the first VF not covered by the plan just built.
    VF = SubRange.End;
  }
}
8356
/// Create and return a ResumePhi for \p WideIV, unless it is truncated. If the
/// induction recipe is not canonical, creates a VPDerivedIVRecipe to compute
/// the end value of the induction. The phi's incoming values are the computed
/// end value (from the vector loop) and the induction's start value (bypass).
static VPInstruction *addResumePhiRecipeForInduction(
    VPWidenInductionRecipe *WideIV, VPBuilder &VectorPHBuilder,
    VPBuilder &ScalarPHBuilder, VPTypeAnalysis &TypeInfo, VPValue *VectorTC) {
  auto *WideIntOrFp = dyn_cast<VPWidenIntOrFpInductionRecipe>(Val: WideIV);
  // Truncated wide inductions resume from the last lane of their vector value
  // in the last vector iteration which is handled elsewhere.
  if (WideIntOrFp && WideIntOrFp->getTruncInst())
    return nullptr;

  VPValue *Start = WideIV->getStartValue();
  VPValue *Step = WideIV->getStepValue();
  const InductionDescriptor &ID = WideIV->getInductionDescriptor();
  // For the canonical induction the vector trip count IS the end value;
  // otherwise derive it as Start + VectorTC * Step (per the induction kind).
  VPValue *EndValue = VectorTC;
  if (!WideIntOrFp || !WideIntOrFp->isCanonical()) {
    EndValue = VectorPHBuilder.createDerivedIV(
        Kind: ID.getKind(), FPBinOp: dyn_cast_or_null<FPMathOperator>(Val: ID.getInductionBinOp()),
        Start, Current: VectorTC, Step);
  }

  // EndValue is derived from the vector trip count (which has the same type as
  // the widest induction) and thus may be wider than the induction here.
  Type *ScalarTypeOfWideIV = TypeInfo.inferScalarType(V: WideIV);
  if (ScalarTypeOfWideIV != TypeInfo.inferScalarType(V: EndValue)) {
    EndValue = VectorPHBuilder.createScalarCast(Opcode: Instruction::Trunc, Op: EndValue,
                                                ResultTy: ScalarTypeOfWideIV,
                                                DL: WideIV->getDebugLoc());
  }

  auto *ResumePhiRecipe = ScalarPHBuilder.createScalarPhi(
      IncomingValues: {EndValue, Start}, DL: WideIV->getDebugLoc(), Name: "bc.resume.val");
  return ResumePhiRecipe;
}
8392
/// Create resume phis in the scalar preheader for first-order recurrences,
/// reductions and inductions, and update the VPIRInstructions wrapping the
/// original phis in the scalar header. End values for inductions are added to
/// \p IVEndValues.
static void addScalarResumePhis(VPRecipeBuilder &Builder, VPlan &Plan,
                                DenseMap<VPValue *, VPValue *> &IVEndValues) {
  VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType());
  auto *ScalarPH = Plan.getScalarPreheader();
  // The scalar preheader's first predecessor is the middle block.
  auto *MiddleVPBB = cast<VPBasicBlock>(Val: ScalarPH->getPredecessors()[0]);
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  VPBuilder VectorPHBuilder(
      cast<VPBasicBlock>(Val: VectorRegion->getSinglePredecessor()));
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());
  VPBuilder ScalarPHBuilder(ScalarPH);
  for (VPRecipeBase &ScalarPhiR : Plan.getScalarHeader()->phis()) {
    auto *ScalarPhiIRI = cast<VPIRPhi>(Val: &ScalarPhiR);

    // TODO: Extract final value from induction recipe initially, optimize to
    // pre-computed end value together in optimizeInductionExitUsers.
    auto *VectorPhiR =
        cast<VPHeaderPHIRecipe>(Val: Builder.getRecipe(I: &ScalarPhiIRI->getIRPhi()));
    if (auto *WideIVR = dyn_cast<VPWidenInductionRecipe>(Val: VectorPhiR)) {
      if (VPInstruction *ResumePhi = addResumePhiRecipeForInduction(
              WideIV: WideIVR, VectorPHBuilder, ScalarPHBuilder, TypeInfo,
              VectorTC: &Plan.getVectorTripCount())) {
        assert(isa<VPPhi>(ResumePhi) && "Expected a phi");
        // Operand 0 of the resume phi is the induction's end value.
        IVEndValues[WideIVR] = ResumePhi->getOperand(N: 0);
        ScalarPhiIRI->addOperand(Operand: ResumePhi);
        continue;
      }
      // TODO: Also handle truncated inductions here. Computing end-values
      // separately should be done as VPlan-to-VPlan optimization, after
      // legalizing all resume values to use the last lane from the loop.
      assert(cast<VPWidenIntOrFpInductionRecipe>(VectorPhiR)->getTruncInst() &&
             "should only skip truncated wide inductions");
      continue;
    }

    // The backedge value provides the value to resume coming out of a loop,
    // which for FORs is a vector whose last element needs to be extracted. The
    // start value provides the value if the loop is bypassed.
    bool IsFOR = isa<VPFirstOrderRecurrencePHIRecipe>(Val: VectorPhiR);
    auto *ResumeFromVectorLoop = VectorPhiR->getBackedgeValue();
    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");
    if (IsFOR)
      ResumeFromVectorLoop = MiddleBuilder.createNaryOp(
          Opcode: VPInstruction::ExtractLastElement, Operands: {ResumeFromVectorLoop}, Inst: {},
          Name: "vector.recur.extract");
    StringRef Name = IsFOR ? "scalar.recur.init" : "bc.merge.rdx";
    auto *ResumePhiR = ScalarPHBuilder.createScalarPhi(
        IncomingValues: {ResumeFromVectorLoop, VectorPhiR->getStartValue()}, DL: {}, Name);
    ScalarPhiIRI->addOperand(Operand: ResumePhiR);
  }
}
8448
// Collect VPIRInstructions for phis in the exit block from the latch only.
// Exit blocks reached via early exits are skipped; phis with live-in operands
// need no fixing and are also skipped.
static SetVector<VPIRInstruction *> collectUsersInLatchExitBlock(VPlan &Plan) {
  SetVector<VPIRInstruction *> ExitUsersToFix;
  for (VPIRBasicBlock *ExitVPBB : Plan.getExitBlocks()) {

    // Only consider exit blocks reached solely from the middle block (i.e.
    // via the latch); early-exit targets are handled elsewhere.
    if (ExitVPBB->getSinglePredecessor() != Plan.getMiddleBlock())
      continue;

    for (VPRecipeBase &R : ExitVPBB->phis()) {
      auto *ExitIRI = cast<VPIRPhi>(Val: &R);
      assert(ExitIRI->getNumOperands() == 1 && "must have a single operand");
      VPValue *V = ExitIRI->getOperand(N: 0);
      // Live-ins are loop-invariant and need no extract.
      if (V->isLiveIn())
        continue;
      assert(V->getDefiningRecipe()->getParent()->getEnclosingLoopRegion() &&
             "Only recipes defined inside a region should need fixing.");
      ExitUsersToFix.insert(X: ExitIRI);
    }
  }
  return ExitUsersToFix;
}
8470
// Add exit values to \p Plan. Extracts are added for each entry in \p
// ExitUsersToFix if needed and their operands are updated. The extracts are
// inserted in the middle block, after any phis.
static void
addUsersInExitBlocks(VPlan &Plan,
                     const SetVector<VPIRInstruction *> &ExitUsersToFix) {
  if (ExitUsersToFix.empty())
    return;

  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder B(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Introduce extract for exiting values and update the VPIRInstructions
  // modeling the corresponding LCSSA phis.
  for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
    assert(ExitIRI->getNumOperands() == 1 &&
           ExitIRI->getParent()->getSinglePredecessor() == MiddleVPBB &&
           "exit values from early exits must be fixed when branch to "
           "early-exit is added");
    // Replace the phi's operand with an extract of its last lane.
    ExitIRI->extractLastLaneOfFirstOperand(Builder&: B);
  }
}
8492
/// Handle users in the exit block for first order reductions in the original
/// exit block. The penultimate value of recurrences is fed to their LCSSA phi
/// users in the original exit block using the VPIRInstruction wrapping to the
/// LCSSA phi. Fixed users are removed from \p ExitUsersToFix; any remaining
/// users are handled later by addUsersInExitBlocks.
static void addExitUsersForFirstOrderRecurrences(
    VPlan &Plan, SetVector<VPIRInstruction *> &ExitUsersToFix, VFRange &Range) {
  VPRegionBlock *VectorRegion = Plan.getVectorLoopRegion();
  auto *ScalarPHVPBB = Plan.getScalarPreheader();
  auto *MiddleVPBB = Plan.getMiddleBlock();
  VPBuilder ScalarPHBuilder(ScalarPHVPBB);
  VPBuilder MiddleBuilder(MiddleVPBB, MiddleVPBB->getFirstNonPhi());

  // Predicate used to detect (and clamp away) the VF <vscale x 1> case below.
  auto IsScalableOne = [](ElementCount VF) -> bool {
    return VF == ElementCount::getScalable(MinVal: 1);
  };

  for (auto &HeaderPhi : VectorRegion->getEntryBasicBlock()->phis()) {
    auto *FOR = dyn_cast<VPFirstOrderRecurrencePHIRecipe>(Val: &HeaderPhi);
    if (!FOR)
      continue;

    assert(VectorRegion->getSingleSuccessor() == Plan.getMiddleBlock() &&
           "Cannot handle loops with uncountable early exits");

    // This is the second phase of vectorizing first-order recurrences, creating
    // extract for users outside the loop. An overview of the transformation is
    // described below. Suppose we have the following loop with some use after
    // the loop of the last a[i-1],
    //
    //   for (int i = 0; i < n; ++i) {
    //     t = a[i - 1];
    //     b[i] = a[i] - t;
    //   }
    //   use t;
    //
    // There is a first-order recurrence on "a". For this loop, the shorthand
    // scalar IR looks like:
    //
    //   scalar.ph:
    //     s.init = a[-1]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [s.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     use = lcssa.phi [s1, scalar.body]
    //
    // In this example, s1 is a recurrence because it's value depends on the
    // previous iteration. In the first phase of vectorization, we created a
    // VPFirstOrderRecurrencePHIRecipe v1 for s1. Now we create the extracts
    // for users in the scalar preheader and exit block.
    //
    //   vector.ph:
    //     v_init = vector(..., ..., ..., a[-1])
    //     br vector.body
    //
    //   vector.body
    //     i = phi [0, vector.ph], [i+4, vector.body]
    //     v1 = phi [v_init, vector.ph], [v2, vector.body]
    //     v2 = a[i, i+1, i+2, i+3]
    //     b[i] = v2 - v1
    //     // Next, third phase will introduce v1' = splice(v1(3), v2(0, 1, 2))
    //     b[i, i+1, i+2, i+3] = v2 - v1
    //     br cond, vector.body, middle.block
    //
    //   middle.block:
    //     vector.recur.extract.for.phi = v2(2)
    //     vector.recur.extract = v2(3)
    //     br cond, scalar.ph, exit.block
    //
    //   scalar.ph:
    //     scalar.recur.init = phi [vector.recur.extract, middle.block],
    //                             [s.init, otherwise]
    //     br scalar.body
    //
    //   scalar.body:
    //     i = phi [0, scalar.ph], [i+1, scalar.body]
    //     s1 = phi [scalar.recur.init, scalar.ph], [s2, scalar.body]
    //     s2 = a[i]
    //     b[i] = s2 - s1
    //     br cond, scalar.body, exit.block
    //
    //   exit.block:
    //     lo = lcssa.phi [s1, scalar.body],
    //                    [vector.recur.extract.for.phi, middle.block]
    //
    // Now update VPIRInstructions modeling LCSSA phis in the exit block.
    // Extract the penultimate value of the recurrence and use it as operand for
    // the VPIRInstruction modeling the phi.
    for (VPIRInstruction *ExitIRI : ExitUsersToFix) {
      if (ExitIRI->getOperand(N: 0) != FOR)
        continue;
      // For VF vscale x 1, if vscale = 1, we are unable to extract the
      // penultimate value of the recurrence. Instead, we rely on function
      // addUsersInExitBlocks to extract the last element from the result of
      // VPInstruction::FirstOrderRecurrenceSplice by leaving the user of the
      // recurrence phi in ExitUsersToFix.
      // TODO: Consider vscale_range info and UF.
      if (LoopVectorizationPlanner::getDecisionAndClampRange(Predicate: IsScalableOne,
                                                             Range))
        return;
      VPValue *PenultimateElement = MiddleBuilder.createNaryOp(
          Opcode: VPInstruction::ExtractPenultimateElement, Operands: {FOR->getBackedgeValue()},
          Inst: {}, Name: "vector.recur.extract.for.phi");
      ExitIRI->setOperand(I: 0, New: PenultimateElement);
      // This user is now fully handled; don't let addUsersInExitBlocks touch
      // it again.
      ExitUsersToFix.remove(X: ExitIRI);
    }
  }
}
8607
8608VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8609 VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
8610
8611 using namespace llvm::VPlanPatternMatch;
8612 SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
8613
8614 // ---------------------------------------------------------------------------
8615 // Build initial VPlan: Scan the body of the loop in a topological order to
8616 // visit each basic block after having visited its predecessor basic blocks.
8617 // ---------------------------------------------------------------------------
8618
8619 // Create initial VPlan skeleton, having a basic block for the pre-header
8620 // which contains SCEV expansions that need to happen before the CFG is
8621 // modified; a basic block for the vector pre-header, followed by a region for
8622 // the vector loop, followed by the middle basic block. The skeleton vector
8623 // loop region contains a header and latch basic blocks.
8624
8625 bool RequiresScalarEpilogueCheck =
8626 LoopVectorizationPlanner::getDecisionAndClampRange(
8627 Predicate: [this](ElementCount VF) {
8628 return !CM.requiresScalarEpilogue(IsVectorizing: VF.isVector());
8629 },
8630 Range);
8631 VPlanTransforms::prepareForVectorization(
8632 Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck,
8633 TailFolded: CM.foldTailByMasking(), TheLoop: OrigLoop,
8634 IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()),
8635 HasUncountableExit: Legal->hasUncountableEarlyExit(), Range);
8636 VPlanTransforms::createLoopRegions(Plan&: *Plan);
8637
8638 // Don't use getDecisionAndClampRange here, because we don't know the UF
8639 // so this function is better to be conservative, rather than to split
8640 // it up into different VPlans.
8641 // TODO: Consider using getDecisionAndClampRange here to split up VPlans.
8642 bool IVUpdateMayOverflow = false;
8643 for (ElementCount VF : Range)
8644 IVUpdateMayOverflow |= !isIndvarOverflowCheckKnownFalse(Cost: &CM, VF);
8645
8646 TailFoldingStyle Style = CM.getTailFoldingStyle(IVUpdateMayOverflow);
8647 // Use NUW for the induction increment if we proved that it won't overflow in
8648 // the vector loop or when not folding the tail. In the later case, we know
8649 // that the canonical induction increment will not overflow as the vector trip
8650 // count is >= increment and a multiple of the increment.
8651 bool HasNUW = !IVUpdateMayOverflow || Style == TailFoldingStyle::None;
8652 if (!HasNUW) {
8653 auto *IVInc = Plan->getVectorLoopRegion()
8654 ->getExitingBasicBlock()
8655 ->getTerminator()
8656 ->getOperand(N: 0);
8657 assert(match(IVInc, m_VPInstruction<Instruction::Add>(
8658 m_Specific(Plan->getCanonicalIV()), m_VPValue())) &&
8659 "Did not find the canonical IV increment");
8660 cast<VPRecipeWithIRFlags>(Val: IVInc)->dropPoisonGeneratingFlags();
8661 }
8662
8663 // ---------------------------------------------------------------------------
8664 // Pre-construction: record ingredients whose recipes we'll need to further
8665 // process after constructing the initial VPlan.
8666 // ---------------------------------------------------------------------------
8667
8668 // For each interleave group which is relevant for this (possibly trimmed)
8669 // Range, add it to the set of groups to be later applied to the VPlan and add
8670 // placeholders for its members' Recipes which we'll be replacing with a
8671 // single VPInterleaveRecipe.
8672 for (InterleaveGroup<Instruction> *IG : IAI.getInterleaveGroups()) {
8673 auto ApplyIG = [IG, this](ElementCount VF) -> bool {
8674 bool Result = (VF.isVector() && // Query is illegal for VF == 1
8675 CM.getWideningDecision(I: IG->getInsertPos(), VF) ==
8676 LoopVectorizationCostModel::CM_Interleave);
8677 // For scalable vectors, the interleave factors must be <= 8 since we
8678 // require the (de)interleaveN intrinsics instead of shufflevectors.
8679 assert((!Result || !VF.isScalable() || IG->getFactor() <= 8) &&
8680 "Unsupported interleave factor for scalable vectors");
8681 return Result;
8682 };
8683 if (!getDecisionAndClampRange(Predicate: ApplyIG, Range))
8684 continue;
8685 InterleaveGroups.insert(Ptr: IG);
8686 }
8687
8688 // ---------------------------------------------------------------------------
8689 // Predicate and linearize the top-level loop region.
8690 // ---------------------------------------------------------------------------
8691 auto BlockMaskCache = VPlanTransforms::introduceMasksAndLinearize(
8692 Plan&: *Plan, FoldTail: CM.foldTailByMasking());
8693
8694 // ---------------------------------------------------------------------------
8695 // Construct wide recipes and apply predication for original scalar
8696 // VPInstructions in the loop.
8697 // ---------------------------------------------------------------------------
8698 VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
8699 Builder, BlockMaskCache, LVer);
8700 RecipeBuilder.collectScaledReductions(Range);
8701
8702 // Scan the body of the loop in a topological order to visit each basic block
8703 // after having visited its predecessor basic blocks.
8704 VPRegionBlock *LoopRegion = Plan->getVectorLoopRegion();
8705 VPBasicBlock *HeaderVPBB = LoopRegion->getEntryBasicBlock();
8706 ReversePostOrderTraversal<VPBlockShallowTraversalWrapper<VPBlockBase *>> RPOT(
8707 HeaderVPBB);
8708
8709 auto *MiddleVPBB = Plan->getMiddleBlock();
8710 VPBasicBlock::iterator MBIP = MiddleVPBB->getFirstNonPhi();
8711 // Mapping from VPValues in the initial plan to their widened VPValues. Needed
8712 // temporarily to update created block masks.
8713 DenseMap<VPValue *, VPValue *> Old2New;
8714 for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(Range: RPOT)) {
8715 // Convert input VPInstructions to widened recipes.
8716 for (VPRecipeBase &R : make_early_inc_range(Range&: *VPBB)) {
8717 auto *SingleDef = cast<VPSingleDefRecipe>(Val: &R);
8718 auto *UnderlyingValue = SingleDef->getUnderlyingValue();
8719 // Skip recipes that do not need transforming, including canonical IV,
8720 // wide canonical IV and VPInstructions without underlying values. The
8721 // latter are added above for masking.
8722 // FIXME: Migrate code relying on the underlying instruction from VPlan0
8723 // to construct recipes below to not use the underlying instruction.
8724 if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
8725 Val: &R) ||
8726 (isa<VPInstruction>(Val: &R) && !UnderlyingValue))
8727 continue;
8728
8729 // FIXME: VPlan0, which models a copy of the original scalar loop, should
8730 // not use VPWidenPHIRecipe to model the phis.
8731 assert((isa<VPWidenPHIRecipe>(&R) || isa<VPInstruction>(&R)) &&
8732 UnderlyingValue && "unsupported recipe");
8733
8734 // TODO: Gradually replace uses of underlying instruction by analyses on
8735 // VPlan.
8736 Instruction *Instr = cast<Instruction>(Val: UnderlyingValue);
8737 Builder.setInsertPoint(SingleDef);
8738
8739 // The stores with invariant address inside the loop will be deleted, and
8740 // in the exit block, a uniform store recipe will be created for the final
8741 // invariant store of the reduction.
8742 StoreInst *SI;
8743 if ((SI = dyn_cast<StoreInst>(Val: Instr)) &&
8744 Legal->isInvariantAddressOfReduction(V: SI->getPointerOperand())) {
8745 // Only create recipe for the final invariant store of the reduction.
8746 if (Legal->isInvariantStoreOfReduction(SI)) {
8747 auto *Recipe =
8748 new VPReplicateRecipe(SI, R.operands(), true /* IsUniform */,
8749 nullptr /*Mask*/, VPIRMetadata(*SI, LVer));
8750 Recipe->insertBefore(BB&: *MiddleVPBB, IP: MBIP);
8751 }
8752 R.eraseFromParent();
8753 continue;
8754 }
8755
8756 VPRecipeBase *Recipe =
8757 RecipeBuilder.tryToCreateWidenRecipe(R: SingleDef, Range);
8758 if (!Recipe) {
8759 SmallVector<VPValue *, 4> Operands(R.operands());
8760 Recipe = RecipeBuilder.handleReplication(I: Instr, Operands, Range);
8761 }
8762
8763 RecipeBuilder.setRecipe(I: Instr, R: Recipe);
8764 if (isa<VPWidenIntOrFpInductionRecipe>(Val: Recipe) && isa<TruncInst>(Val: Instr)) {
8765 // Optimized a truncate to VPWidenIntOrFpInductionRecipe. It needs to be
8766 // moved to the phi section in the header.
8767 Recipe->insertBefore(BB&: *HeaderVPBB, IP: HeaderVPBB->getFirstNonPhi());
8768 } else {
8769 Builder.insert(R: Recipe);
8770 }
8771 if (Recipe->getNumDefinedValues() == 1) {
8772 SingleDef->replaceAllUsesWith(New: Recipe->getVPSingleValue());
8773 Old2New[SingleDef] = Recipe->getVPSingleValue();
8774 } else {
8775 assert(Recipe->getNumDefinedValues() == 0 &&
8776 "Unexpected multidef recipe");
8777 R.eraseFromParent();
8778 }
8779 }
8780 }
8781
8782 // replaceAllUsesWith above may invalidate the block masks. Update them here.
8783 // TODO: Include the masks as operands in the predicated VPlan directly
8784 // to remove the need to keep a map of masks beyond the predication
8785 // transform.
8786 RecipeBuilder.updateBlockMaskCache(Old2New);
8787 for (const auto &[Old, _] : Old2New)
8788 Old->getDefiningRecipe()->eraseFromParent();
8789
8790 assert(isa<VPRegionBlock>(Plan->getVectorLoopRegion()) &&
8791 !Plan->getVectorLoopRegion()->getEntryBasicBlock()->empty() &&
8792 "entry block must be set to a VPRegionBlock having a non-empty entry "
8793 "VPBasicBlock");
8794
8795 // Update wide induction increments to use the same step as the corresponding
8796 // wide induction. This enables detecting induction increments directly in
8797 // VPlan and removes redundant splats.
8798 for (const auto &[Phi, ID] : Legal->getInductionVars()) {
8799 auto *IVInc = cast<Instruction>(
8800 Val: Phi->getIncomingValueForBlock(BB: OrigLoop->getLoopLatch()));
8801 if (IVInc->getOperand(i: 0) != Phi || IVInc->getOpcode() != Instruction::Add)
8802 continue;
8803 VPWidenInductionRecipe *WideIV =
8804 cast<VPWidenInductionRecipe>(Val: RecipeBuilder.getRecipe(I: Phi));
8805 VPRecipeBase *R = RecipeBuilder.getRecipe(I: IVInc);
8806 R->setOperand(I: 1, New: WideIV->getStepValue());
8807 }
8808
8809 DenseMap<VPValue *, VPValue *> IVEndValues;
8810 addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues);
8811 SetVector<VPIRInstruction *> ExitUsersToFix =
8812 collectUsersInLatchExitBlock(Plan&: *Plan);
8813 addExitUsersForFirstOrderRecurrences(Plan&: *Plan, ExitUsersToFix, Range);
8814 addUsersInExitBlocks(Plan&: *Plan, ExitUsersToFix);
8815
8816 // ---------------------------------------------------------------------------
8817 // Transform initial VPlan: Apply previously taken decisions, in order, to
8818 // bring the VPlan to its final state.
8819 // ---------------------------------------------------------------------------
8820
8821 // Adjust the recipes for any inloop reductions.
8822 adjustRecipesForReductions(Plan, RecipeBuilder, MinVF: Range.Start);
8823
8824 // Apply mandatory transformation to handle FP maxnum/minnum reduction with
8825 // NaNs if possible, bail out otherwise.
8826 if (!VPlanTransforms::runPass(Transform: VPlanTransforms::handleMaxMinNumReductions,
8827 Plan&: *Plan))
8828 return nullptr;
8829
8830 // Transform recipes to abstract recipes if it is legal and beneficial and
8831 // clamp the range for better cost estimation.
8832 // TODO: Enable following transform when the EVL-version of extended-reduction
8833 // and mulacc-reduction are implemented.
8834 if (!CM.foldTailWithEVL()) {
8835 VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM,
8836 CM.CostKind);
8837 VPlanTransforms::runPass(Fn: VPlanTransforms::convertToAbstractRecipes, Plan&: *Plan,
8838 Args&: CostCtx, Args&: Range);
8839 }
8840
8841 for (ElementCount VF : Range)
8842 Plan->addVF(VF);
8843 Plan->setName("Initial VPlan");
8844
8845 // Interleave memory: for each Interleave Group we marked earlier as relevant
8846 // for this VPlan, replace the Recipes widening its memory instructions with a
8847 // single VPInterleaveRecipe at its insertion point.
8848 VPlanTransforms::runPass(Fn: VPlanTransforms::createInterleaveGroups, Plan&: *Plan,
8849 Args: InterleaveGroups, Args&: RecipeBuilder,
8850 Args: CM.isScalarEpilogueAllowed());
8851
8852 // Replace VPValues for known constant strides guaranteed by predicate scalar
8853 // evolution.
8854 auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) {
8855 auto *R = cast<VPRecipeBase>(Val: &U);
8856 return R->getParent()->getParent() ||
8857 R->getParent() ==
8858 Plan->getVectorLoopRegion()->getSinglePredecessor();
8859 };
8860 for (auto [_, Stride] : Legal->getLAI()->getSymbolicStrides()) {
8861 auto *StrideV = cast<SCEVUnknown>(Val: Stride)->getValue();
8862 auto *ScevStride = dyn_cast<SCEVConstant>(Val: PSE.getSCEV(V: StrideV));
8863 // Only handle constant strides for now.
8864 if (!ScevStride)
8865 continue;
8866
8867 auto *CI = Plan->getOrAddLiveIn(
8868 V: ConstantInt::get(Ty: Stride->getType(), V: ScevStride->getAPInt()));
8869 if (VPValue *StrideVPV = Plan->getLiveIn(V: StrideV))
8870 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
8871
8872 // The versioned value may not be used in the loop directly but through a
8873 // sext/zext. Add new live-ins in those cases.
8874 for (Value *U : StrideV->users()) {
8875 if (!isa<SExtInst, ZExtInst>(Val: U))
8876 continue;
8877 VPValue *StrideVPV = Plan->getLiveIn(V: U);
8878 if (!StrideVPV)
8879 continue;
8880 unsigned BW = U->getType()->getScalarSizeInBits();
8881 APInt C = isa<SExtInst>(Val: U) ? ScevStride->getAPInt().sext(width: BW)
8882 : ScevStride->getAPInt().zext(width: BW);
8883 VPValue *CI = Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: U->getType(), V: C));
8884 StrideVPV->replaceUsesWithIf(New: CI, ShouldReplace: CanUseVersionedStride);
8885 }
8886 }
8887
8888 auto BlockNeedsPredication = [this](BasicBlock *BB) {
8889 return Legal->blockNeedsPredication(BB);
8890 };
8891 VPlanTransforms::runPass(Fn: VPlanTransforms::dropPoisonGeneratingRecipes, Plan&: *Plan,
8892 Args: BlockNeedsPredication);
8893
8894 // Sink users of fixed-order recurrence past the recipe defining the previous
8895 // value and introduce FirstOrderRecurrenceSplice VPInstructions.
8896 if (!VPlanTransforms::runPass(Transform: VPlanTransforms::adjustFixedOrderRecurrences,
8897 Plan&: *Plan, Args&: Builder))
8898 return nullptr;
8899
8900 if (useActiveLaneMask(Style)) {
8901 // TODO: Move checks to VPlanTransforms::addActiveLaneMask once
8902 // TailFoldingStyle is visible there.
8903 bool ForControlFlow = useActiveLaneMaskForControlFlow(Style);
8904 bool WithoutRuntimeCheck =
8905 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
8906 VPlanTransforms::addActiveLaneMask(Plan&: *Plan, UseActiveLaneMaskForControlFlow: ForControlFlow,
8907 DataAndControlFlowWithoutRuntimeCheck: WithoutRuntimeCheck);
8908 }
8909 VPlanTransforms::optimizeInductionExitUsers(Plan&: *Plan, EndValues&: IVEndValues);
8910
8911 assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
8912 return Plan;
8913}
8914
// Build a VPlan for the VPlan-native (outer-loop) vectorization path covering
// all VFs in \p Range. Returns nullptr if the plan cannot be constructed,
// e.g. when VPInstructions cannot be converted to widened recipes.
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlan(VFRange &Range) {
  // Outer loop handling: They may require CFG and instruction level
  // transformations before even evaluating whether vectorization is profitable.
  // Since we cannot modify the incoming IR, we need to build VPlan upfront in
  // the vectorization pipeline.
  assert(!OrigLoop->isInnermost());
  assert(EnableVPlanNativePath && "VPlan-native path is not enabled.");

  // First build a plain CFG skeleton of the loop nest, then shape it for
  // vectorization and introduce the region structure for the loops.
  auto Plan = VPlanTransforms::buildPlainCFG(TheLoop: OrigLoop, LI&: *LI);
  VPlanTransforms::prepareForVectorization(
      Plan&: *Plan, InductionTy: Legal->getWidestInductionType(), PSE, RequiresScalarEpilogueCheck: true, TailFolded: false, TheLoop: OrigLoop,
      IVDL: getDebugLocFromInstOrOperands(I: Legal->getPrimaryInduction()), HasUncountableExit: false,
      Range);
  VPlanTransforms::createLoopRegions(Plan&: *Plan);

  // Record all candidate VFs this plan is valid for.
  for (ElementCount VF : Range)
    Plan->addVF(VF);

  if (!VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
          Plan,
          GetIntOrFpInductionDescriptor: [this](PHINode *P) {
            return Legal->getIntOrFpInductionDescriptor(Phi: P);
          },
          SE&: *PSE.getSE(), TLI: *TLI))
    return nullptr;

  // Collect mapping of IR header phis to header phi recipes, to be used in
  // addScalarResumePhis.
  DenseMap<VPBasicBlock *, VPValue *> BlockMaskCache;
  VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, &TTI, Legal, CM, PSE,
                                Builder, BlockMaskCache, nullptr /*LVer*/);
  for (auto &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    // The canonical IV recipe has no underlying IR phi to register.
    if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
      continue;
    auto *HeaderR = cast<VPHeaderPHIRecipe>(Val: &R);
    RecipeBuilder.setRecipe(I: HeaderR->getUnderlyingInstr(), R: HeaderR);
  }
  DenseMap<VPValue *, VPValue *> IVEndValues;
  // TODO: IVEndValues are not used yet in the native path, to optimize exit
  // values.
  addScalarResumePhis(Builder&: RecipeBuilder, Plan&: *Plan, IVEndValues);

  assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
  return Plan;
}
8960
// Adjust the recipes for reductions. For in-loop reductions the chain of
// instructions leading from the loop exit instr to the phi need to be converted
// to reductions, with one operand being vector and the other being the scalar
// reduction chain. For other reductions, a select is introduced between the phi
// and users outside the vector region when folding the tail.
//
// A ComputeReductionResult recipe is added to the middle block, also for
// in-loop reductions which compute their result in-loop, because generating
// the subsequent bc.merge.rdx phi is driven by ComputeReductionResult recipes.
//
// Adjust AnyOf reductions; replace the reduction phi for the selected value
// with a boolean reduction phi node to check if the condition is true in any
// iteration. The final value is selected by the final ComputeReductionResult.
void LoopVectorizationPlanner::adjustRecipesForReductions(
    VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) {
  using namespace VPlanPatternMatch;
  VPRegionBlock *VectorLoopRegion = Plan->getVectorLoopRegion();
  VPBasicBlock *Header = VectorLoopRegion->getEntryBasicBlock();
  VPBasicBlock *MiddleVPBB = Plan->getMiddleBlock();
  // Recipes replaced below; erased at the end once no stale uses remain.
  SmallVector<VPRecipeBase *> ToDelete;

  // Phase 1: rewrite in-loop reduction chains into VPReductionRecipes.
  for (VPRecipeBase &R : Header->phis()) {
    auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    if (!PhiR || !PhiR->isInLoop() || (MinVF.isScalar() && !PhiR->isOrdered()))
      continue;

    RecurKind Kind = PhiR->getRecurrenceKind();
    assert(
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind) &&
        !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind) &&
        "AnyOf and FindIV reductions are not allowed for in-loop reductions");

    // Collect the chain of "link" recipes for the reduction starting at PhiR.
    // The SetVector is grown while iterating, yielding a top-down traversal of
    // the use chain inside the loop region.
    SetVector<VPSingleDefRecipe *> Worklist;
    Worklist.insert(X: PhiR);
    for (unsigned I = 0; I != Worklist.size(); ++I) {
      VPSingleDefRecipe *Cur = Worklist[I];
      for (VPUser *U : Cur->users()) {
        auto *UserRecipe = cast<VPSingleDefRecipe>(Val: U);
        if (!UserRecipe->getParent()->getEnclosingLoopRegion()) {
          assert((UserRecipe->getParent() == MiddleVPBB ||
                  UserRecipe->getParent() == Plan->getScalarPreheader()) &&
                 "U must be either in the loop region, the middle block or the "
                 "scalar preheader.");
          continue;
        }
        Worklist.insert(X: UserRecipe);
      }
    }

    // Visit operation "Links" along the reduction chain top-down starting from
    // the phi until LoopExitValue. We keep track of the previous item
    // (PreviousLink) to tell which of the two operands of a Link will remain
    // scalar and which will be reduced. For minmax by select(cmp), Link will be
    // the select instructions. Blend recipes of in-loop reduction phi's will
    // get folded to their non-phi operand, as the reduction recipe handles the
    // condition directly.
    VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0].
    for (VPSingleDefRecipe *CurrentLink : drop_begin(RangeOrContainer&: Worklist)) {
      if (auto *Blend = dyn_cast<VPBlendRecipe>(Val: CurrentLink)) {
        assert(Blend->getNumIncomingValues() == 2 &&
               "Blend must have 2 incoming values");
        // Fold the blend to whichever incoming value is not the phi.
        if (Blend->getIncomingValue(Idx: 0) == PhiR) {
          Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 1));
        } else {
          assert(Blend->getIncomingValue(1) == PhiR &&
                 "PhiR must be an operand of the blend");
          Blend->replaceAllUsesWith(New: Blend->getIncomingValue(Idx: 0));
        }
        continue;
      }

      Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr();

      // Index of the first operand which holds a non-mask vector operand.
      unsigned IndexOfFirstOperand;
      // Recognize a call to the llvm.fmuladd intrinsic.
      bool IsFMulAdd = (Kind == RecurKind::FMulAdd);
      VPValue *VecOp;
      VPBasicBlock *LinkVPBB = CurrentLink->getParent();
      if (IsFMulAdd) {
        assert(
            RecurrenceDescriptor::isFMulAddIntrinsic(CurrentLinkI) &&
            "Expected instruction to be a call to the llvm.fmuladd intrinsic");
        assert(((MinVF.isScalar() && isa<VPReplicateRecipe>(CurrentLink)) ||
                isa<VPWidenIntrinsicRecipe>(CurrentLink)) &&
               CurrentLink->getOperand(2) == PreviousLink &&
               "expected a call where the previous link is the added operand");

        // If the instruction is a call to the llvm.fmuladd intrinsic then we
        // need to create an fmul recipe (multiplying the first two operands of
        // the fmuladd together) to use as the vector operand for the fadd
        // reduction.
        VPInstruction *FMulRecipe = new VPInstruction(
            Instruction::FMul,
            {CurrentLink->getOperand(N: 0), CurrentLink->getOperand(N: 1)},
            CurrentLinkI->getFastMathFlags());
        LinkVPBB->insert(Recipe: FMulRecipe, InsertPt: CurrentLink->getIterator());
        VecOp = FMulRecipe;
      } else {
        if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
          if (isa<VPWidenRecipe>(Val: CurrentLink)) {
            assert(isa<CmpInst>(CurrentLinkI) &&
                   "need to have the compare of the select");
            continue;
          }
          assert(isa<VPWidenSelectRecipe>(CurrentLink) &&
                 "must be a select recipe");
          // Operand 0 of a select is the condition; skip it.
          IndexOfFirstOperand = 1;
        } else {
          assert((MinVF.isScalar() || isa<VPWidenRecipe>(CurrentLink)) &&
                 "Expected to replace a VPWidenSC");
          IndexOfFirstOperand = 0;
        }
        // Note that for non-commutable operands (cmp-selects), the semantics of
        // the cmp-select are captured in the recurrence kind.
        unsigned VecOpId =
            CurrentLink->getOperand(N: IndexOfFirstOperand) == PreviousLink
                ? IndexOfFirstOperand + 1
                : IndexOfFirstOperand;
        VecOp = CurrentLink->getOperand(N: VecOpId);
        assert(VecOp != PreviousLink &&
               CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 -
                                       (VecOpId - IndexOfFirstOperand)) ==
                   PreviousLink &&
               "PreviousLink must be the operand other than VecOp");
      }

      // If the link's block is predicated, the reduction op itself must be
      // masked; pass the block-in mask as the condition operand.
      VPValue *CondOp = nullptr;
      if (CM.blockNeedsPredicationForAnyReason(BB: CurrentLinkI->getParent()))
        CondOp = RecipeBuilder.getBlockInMask(VPBB: CurrentLink->getParent());

      // TODO: Retrieve FMFs from recipes directly.
      RecurrenceDescriptor RdxDesc = Legal->getRecurrenceDescriptor(
          PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
      // Non-FP RdxDescs will have all fast math flags set, so clear them.
      FastMathFlags FMFs = isa<FPMathOperator>(Val: CurrentLinkI)
                               ? RdxDesc.getFastMathFlags()
                               : FastMathFlags();
      auto *RedRecipe = new VPReductionRecipe(
          Kind, FMFs, CurrentLinkI, PreviousLink, VecOp, CondOp,
          PhiR->isOrdered(), CurrentLinkI->getDebugLoc());
      // Append the recipe to the end of the VPBasicBlock because we need to
      // ensure that it comes after all of it's inputs, including CondOp.
      // Delete CurrentLink as it will be invalid if its operand is replaced
      // with a reduction defined at the bottom of the block in the next link.
      if (LinkVPBB->getNumSuccessors() == 0)
        RedRecipe->insertBefore(InsertPos: &*std::prev(x: std::prev(x: LinkVPBB->end())));
      else
        LinkVPBB->appendRecipe(Recipe: RedRecipe);

      CurrentLink->replaceAllUsesWith(New: RedRecipe);
      ToDelete.push_back(Elt: CurrentLink);
      PreviousLink = RedRecipe;
    }
  }
  // Phase 2: for every reduction phi, create the final-result computation in
  // the middle block and apply tail-folding / AnyOf / FindIV adjustments.
  VPBasicBlock *LatchVPBB = VectorLoopRegion->getExitingBasicBlock();
  Builder.setInsertPoint(&*std::prev(x: std::prev(x: LatchVPBB->end())));
  VPBasicBlock::iterator IP = MiddleVPBB->getFirstNonPhi();
  for (VPRecipeBase &R :
       Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
    VPReductionPHIRecipe *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
    if (!PhiR)
      continue;

    const RecurrenceDescriptor &RdxDesc = Legal->getRecurrenceDescriptor(
        PN: cast<PHINode>(Val: PhiR->getUnderlyingInstr()));
    Type *PhiTy = PhiR->getUnderlyingValue()->getType();
    // If tail is folded by masking, introduce selects between the phi
    // and the users outside the vector region of each reduction, at the
    // beginning of the dedicated latch block.
    auto *OrigExitingVPV = PhiR->getBackedgeValue();
    auto *NewExitingVPV = PhiR->getBackedgeValue();
    // Don't output selects for partial reductions because they have an output
    // with fewer lanes than the VF. So the operands of the select would have
    // different numbers of lanes. Partial reductions mask the input instead.
    if (!PhiR->isInLoop() && CM.foldTailByMasking() &&
        !isa<VPPartialReductionRecipe>(Val: OrigExitingVPV->getDefiningRecipe())) {
      VPValue *Cond = RecipeBuilder.getBlockInMask(VPBB: PhiR->getParent());
      std::optional<FastMathFlags> FMFs =
          PhiTy->isFloatingPointTy()
              ? std::make_optional(t: RdxDesc.getFastMathFlags())
              : std::nullopt;
      NewExitingVPV =
          Builder.createSelect(Cond, TrueVal: OrigExitingVPV, FalseVal: PhiR, DL: {}, Name: "", FMFs);
      // Only the final-result computations should see the masked value; other
      // users (e.g. the backedge) keep the unmasked exiting value.
      OrigExitingVPV->replaceUsesWithIf(New: NewExitingVPV, ShouldReplace: [](VPUser &U, unsigned) {
        return isa<VPInstruction>(Val: &U) &&
               (cast<VPInstruction>(Val: &U)->getOpcode() ==
                    VPInstruction::ComputeAnyOfResult ||
                cast<VPInstruction>(Val: &U)->getOpcode() ==
                    VPInstruction::ComputeReductionResult ||
                cast<VPInstruction>(Val: &U)->getOpcode() ==
                    VPInstruction::ComputeFindIVResult);
      });
      if (CM.usePredicatedReductionSelect())
        PhiR->setOperand(I: 1, New: NewExitingVPV);
    }

    // We want code in the middle block to appear to execute on the location of
    // the scalar loop's latch terminator because: (a) it is all compiler
    // generated, (b) these instructions are always executed after evaluating
    // the latch conditional branch, and (c) other passes may add new
    // predecessors which terminate on this line. This is the easiest way to
    // ensure we don't accidentally cause an extra step back into the loop while
    // debugging.
    DebugLoc ExitDL = OrigLoop->getLoopLatch()->getTerminator()->getDebugLoc();

    // TODO: At the moment ComputeReductionResult also drives creation of the
    // bc.merge.rdx phi nodes, hence it needs to be created unconditionally here
    // even for in-loop reductions, until the reduction resume value handling is
    // also modeled in VPlan.
    VPInstruction *FinalReductionResult;
    VPBuilder::InsertPointGuard Guard(Builder);
    Builder.setInsertPoint(TheBB: MiddleVPBB, IP);
    RecurKind RecurrenceKind = PhiR->getRecurrenceKind();
    if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      VPValue *Sentinel = Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue());
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeFindIVResult,
                               Operands: {PhiR, Start, Sentinel, NewExitingVPV}, DL: ExitDL);
    } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      VPValue *Start = PhiR->getStartValue();
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeAnyOfResult,
                               Operands: {PhiR, Start, NewExitingVPV}, DL: ExitDL);
    } else {
      VPIRFlags Flags =
          RecurrenceDescriptor::isFloatingPointRecurrenceKind(Kind: RecurrenceKind)
              ? VPIRFlags(RdxDesc.getFastMathFlags())
              : VPIRFlags();
      FinalReductionResult =
          Builder.createNaryOp(Opcode: VPInstruction::ComputeReductionResult,
                               Operands: {PhiR, NewExitingVPV}, Flags, DL: ExitDL);
    }
    // If the vector reduction can be performed in a smaller type, we truncate
    // then extend the loop exit value to enable InstCombine to evaluate the
    // entire expression in the smaller type.
    if (MinVF.isVector() && PhiTy != RdxDesc.getRecurrenceType() &&
        !RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      assert(!PhiR->isInLoop() && "Unexpected truncated inloop reduction!");
      assert(!RecurrenceDescriptor::isMinMaxRecurrenceKind(RecurrenceKind) &&
             "Unexpected truncated min-max recurrence!");
      Type *RdxTy = RdxDesc.getRecurrenceType();
      auto *Trunc =
          new VPWidenCastRecipe(Instruction::Trunc, NewExitingVPV, RdxTy);
      Instruction::CastOps ExtendOpc =
          RdxDesc.isSigned() ? Instruction::SExt : Instruction::ZExt;
      auto *Extnd = new VPWidenCastRecipe(ExtendOpc, Trunc, PhiTy);
      Trunc->insertAfter(InsertPos: NewExitingVPV->getDefiningRecipe());
      Extnd->insertAfter(InsertPos: Trunc);
      if (PhiR->getOperand(N: 1) == NewExitingVPV)
        PhiR->setOperand(I: 1, New: Extnd->getVPSingleValue());

      // Update ComputeReductionResult with the truncated exiting value and
      // extend its result.
      FinalReductionResult->setOperand(I: 1, New: Trunc);
      FinalReductionResult =
          Builder.createScalarCast(Opcode: ExtendOpc, Op: FinalReductionResult, ResultTy: PhiTy, DL: {});
    }

    // Update all users outside the vector region. Also replace redundant
    // ExtractLastElement.
    for (auto *U : to_vector(Range: OrigExitingVPV->users())) {
      auto *Parent = cast<VPRecipeBase>(Val: U)->getParent();
      if (FinalReductionResult == U || Parent->getParent())
        continue;
      U->replaceUsesOfWith(From: OrigExitingVPV, To: FinalReductionResult);
      if (match(U, P: m_VPInstruction<VPInstruction::ExtractLastElement>(
                       Op0: m_VPValue())))
        cast<VPInstruction>(Val: U)->replaceAllUsesWith(New: FinalReductionResult);
    }

    // Adjust AnyOf reductions; replace the reduction phi for the selected value
    // with a boolean reduction phi node to check if the condition is true in
    // any iteration. The final value is selected by the final
    // ComputeReductionResult.
    if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RecurrenceKind)) {
      auto *Select = cast<VPRecipeBase>(Val: *find_if(Range: PhiR->users(), P: [](VPUser *U) {
        return isa<VPWidenSelectRecipe>(Val: U) ||
               (isa<VPReplicateRecipe>(Val: U) &&
                cast<VPReplicateRecipe>(Val: U)->getUnderlyingInstr()->getOpcode() ==
                    Instruction::Select);
      }));
      VPValue *Cmp = Select->getOperand(N: 0);
      // If the compare is checking the reduction PHI node, adjust it to check
      // the start value.
      if (VPRecipeBase *CmpR = Cmp->getDefiningRecipe())
        CmpR->replaceUsesOfWith(From: PhiR, To: PhiR->getStartValue());
      Builder.setInsertPoint(Select);

      // If the true value of the select is the reduction phi, the new value is
      // selected if the negated condition is true in any iteration.
      if (Select->getOperand(N: 1) == PhiR)
        Cmp = Builder.createNot(Operand: Cmp);
      VPValue *Or = Builder.createOr(LHS: PhiR, RHS: Cmp);
      Select->getVPSingleValue()->replaceAllUsesWith(New: Or);
      // Delete Select now that it has invalid types.
      ToDelete.push_back(Elt: Select);

      // Convert the reduction phi to operate on bools.
      PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: ConstantInt::getFalse(
                              Context&: OrigLoop->getHeader()->getContext())));
      continue;
    }

    if (RecurrenceDescriptor::isFindIVRecurrenceKind(
            Kind: RdxDesc.getRecurrenceKind())) {
      // Adjust the start value for FindFirstIV/FindLastIV recurrences to use
      // the sentinel value after generating the ResumePhi recipe, which uses
      // the original start value.
      PhiR->setOperand(I: 0, New: Plan->getOrAddLiveIn(V: RdxDesc.getSentinelValue()));
    }
    // For plain reductions, seed the phi with a ReductionStartVector built
    // from the start value and the recurrence's identity element.
    RecurKind RK = RdxDesc.getRecurrenceKind();
    if ((!RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK) &&
         !RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind: RK))) {
      VPBuilder PHBuilder(Plan->getVectorPreheader());
      VPValue *Iden = Plan->getOrAddLiveIn(
          V: getRecurrenceIdentity(K: RK, Tp: PhiTy, FMF: RdxDesc.getFastMathFlags()));
      // If the PHI is used by a partial reduction, set the scale factor.
      unsigned ScaleFactor =
          RecipeBuilder.getScalingForReduction(ExitInst: RdxDesc.getLoopExitInstr())
              .value_or(u: 1);
      Type *I32Ty = IntegerType::getInt32Ty(C&: PhiTy->getContext());
      auto *ScaleFactorVPV =
          Plan->getOrAddLiveIn(V: ConstantInt::get(Ty: I32Ty, V: ScaleFactor));
      VPValue *StartV = PHBuilder.createNaryOp(
          Opcode: VPInstruction::ReductionStartVector,
          Operands: {PhiR->getStartValue(), Iden, ScaleFactorVPV},
          Flags: PhiTy->isFloatingPointTy() ? RdxDesc.getFastMathFlags()
                                       : FastMathFlags());
      PhiR->setOperand(I: 0, New: StartV);
    }
  }
  for (VPRecipeBase *R : ToDelete)
    R->eraseFromParent();

  VPlanTransforms::runPass(Fn: VPlanTransforms::clearReductionWrapFlags, Plan&: *Plan);
}
9301
9302void LoopVectorizationPlanner::attachRuntimeChecks(
9303 VPlan &Plan, GeneratedRTChecks &RTChecks, bool HasBranchWeights) const {
9304 const auto &[SCEVCheckCond, SCEVCheckBlock] = RTChecks.getSCEVChecks();
9305 if (SCEVCheckBlock) {
9306 assert((!CM.OptForSize ||
9307 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled) &&
9308 "Cannot SCEV check stride or overflow when optimizing for size");
9309 VPlanTransforms::attachCheckBlock(Plan, Cond: SCEVCheckCond, CheckBlock: SCEVCheckBlock,
9310 AddBranchWeights: HasBranchWeights);
9311 }
9312 const auto &[MemCheckCond, MemCheckBlock] = RTChecks.getMemRuntimeChecks();
9313 if (MemCheckBlock) {
9314 // VPlan-native path does not do any analysis for runtime checks
9315 // currently.
9316 assert((!EnableVPlanNativePath || OrigLoop->isInnermost()) &&
9317 "Runtime checks are not supported for outer loops yet");
9318
9319 if (CM.OptForSize) {
9320 assert(
9321 CM.Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
9322 "Cannot emit memory checks when optimizing for size, unless forced "
9323 "to vectorize.");
9324 ORE->emit(RemarkBuilder: [&]() {
9325 return OptimizationRemarkAnalysis(DEBUG_TYPE, "VectorizationCodeSize",
9326 OrigLoop->getStartLoc(),
9327 OrigLoop->getHeader())
9328 << "Code-size may be reduced by not forcing "
9329 "vectorization, or by source-code modifications "
9330 "eliminating the need for runtime checks "
9331 "(e.g., adding 'restrict').";
9332 });
9333 }
9334 VPlanTransforms::attachCheckBlock(Plan, Cond: MemCheckCond, CheckBlock: MemCheckBlock,
9335 AddBranchWeights: HasBranchWeights);
9336 }
9337}
9338
9339void VPDerivedIVRecipe::execute(VPTransformState &State) {
9340 assert(!State.Lane && "VPDerivedIVRecipe being replicated.");
9341
9342 // Fast-math-flags propagate from the original induction instruction.
9343 IRBuilder<>::FastMathFlagGuard FMFG(State.Builder);
9344 if (FPBinOp)
9345 State.Builder.setFastMathFlags(FPBinOp->getFastMathFlags());
9346
9347 Value *Step = State.get(Def: getStepValue(), Lane: VPLane(0));
9348 Value *Index = State.get(Def: getOperand(N: 1), Lane: VPLane(0));
9349 Value *DerivedIV = emitTransformedIndex(
9350 B&: State.Builder, Index, StartValue: getStartValue()->getLiveInIRValue(), Step, InductionKind: Kind,
9351 InductionBinOp: cast_if_present<BinaryOperator>(Val: FPBinOp));
9352 DerivedIV->setName(Name);
9353 // If index is the vector trip count, the concrete value will only be set in
9354 // prepareToExecute, leading to missed simplifications, e.g. if it is 0.
9355 // TODO: Remove the special case for the vector trip count once it is computed
9356 // in VPlan and can be used during VPlan simplification.
9357 assert((DerivedIV != Index ||
9358 getOperand(1) == &getParent()->getPlan()->getVectorTripCount()) &&
9359 "IV didn't need transforming?");
9360 State.set(Def: this, V: DerivedIV, Lane: VPLane(0));
9361}
9362
9363// Determine how to lower the scalar epilogue, which depends on 1) optimising
9364// for minimum code-size, 2) predicate compiler options, 3) loop hints forcing
9365// predication, and 4) a TTI hook that analyses whether the loop is suitable
9366// for predication.
9367static ScalarEpilogueLowering getScalarEpilogueLowering(
9368 Function *F, Loop *L, LoopVectorizeHints &Hints, ProfileSummaryInfo *PSI,
9369 BlockFrequencyInfo *BFI, TargetTransformInfo *TTI, TargetLibraryInfo *TLI,
9370 LoopVectorizationLegality &LVL, InterleavedAccessInfo *IAI) {
9371 // 1) OptSize takes precedence over all other options, i.e. if this is set,
9372 // don't look at hints or options, and don't request a scalar epilogue.
9373 // (For PGSO, as shouldOptimizeForSize isn't currently accessible from
9374 // LoopAccessInfo (due to code dependency and not being able to reliably get
9375 // PSI/BFI from a loop analysis under NPM), we cannot suppress the collection
9376 // of strides in LoopAccessInfo::analyzeLoop() and vectorize without
9377 // versioning when the vectorization is forced, unlike hasOptSize. So revert
9378 // back to the old way and vectorize with versioning when forced. See D81345.)
9379 if (F->hasOptSize() || (llvm::shouldOptimizeForSize(BB: L->getHeader(), PSI, BFI,
9380 QueryType: PGSOQueryType::IRPass) &&
9381 Hints.getForce() != LoopVectorizeHints::FK_Enabled))
9382 return CM_ScalarEpilogueNotAllowedOptSize;
9383
9384 // 2) If set, obey the directives
9385 if (PreferPredicateOverEpilogue.getNumOccurrences()) {
9386 switch (PreferPredicateOverEpilogue) {
9387 case PreferPredicateTy::ScalarEpilogue:
9388 return CM_ScalarEpilogueAllowed;
9389 case PreferPredicateTy::PredicateElseScalarEpilogue:
9390 return CM_ScalarEpilogueNotNeededUsePredicate;
9391 case PreferPredicateTy::PredicateOrDontVectorize:
9392 return CM_ScalarEpilogueNotAllowedUsePredicate;
9393 };
9394 }
9395
9396 // 3) If set, obey the hints
9397 switch (Hints.getPredicate()) {
9398 case LoopVectorizeHints::FK_Enabled:
9399 return CM_ScalarEpilogueNotNeededUsePredicate;
9400 case LoopVectorizeHints::FK_Disabled:
9401 return CM_ScalarEpilogueAllowed;
9402 };
9403
9404 // 4) if the TTI hook indicates this is profitable, request predication.
9405 TailFoldingInfo TFI(TLI, &LVL, IAI);
9406 if (TTI->preferPredicateOverEpilogue(TFI: &TFI))
9407 return CM_ScalarEpilogueNotNeededUsePredicate;
9408
9409 return CM_ScalarEpilogueAllowed;
9410}
9411
9412// Process the loop in the VPlan-native vectorization path. This path builds
9413// VPlan upfront in the vectorization pipeline, which allows to apply
9414// VPlan-to-VPlan transformations from the very beginning without modifying the
9415// input LLVM IR.
9416static bool processLoopInVPlanNativePath(
9417 Loop *L, PredicatedScalarEvolution &PSE, LoopInfo *LI, DominatorTree *DT,
9418 LoopVectorizationLegality *LVL, TargetTransformInfo *TTI,
9419 TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC,
9420 OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
9421 ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints,
9422 LoopVectorizationRequirements &Requirements) {
9423
9424 if (isa<SCEVCouldNotCompute>(Val: PSE.getBackedgeTakenCount())) {
9425 LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
9426 return false;
9427 }
9428 assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
9429 Function *F = L->getHeader()->getParent();
9430 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
9431
9432 ScalarEpilogueLowering SEL =
9433 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL&: *LVL, IAI: &IAI);
9434
9435 LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F,
9436 &Hints, IAI, PSI, BFI);
9437 // Use the planner for outer loop vectorization.
9438 // TODO: CM is not used at this point inside the planner. Turn CM into an
9439 // optional argument if we don't need it in the future.
9440 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, LVL, CM, IAI, PSE, Hints,
9441 ORE);
9442
9443 // Get user vectorization factor.
9444 ElementCount UserVF = Hints.getWidth();
9445
9446 CM.collectElementTypesForWidening();
9447
9448 // Plan how to best vectorize, return the best VF and its cost.
9449 const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF);
9450
9451 // If we are stress testing VPlan builds, do not attempt to generate vector
9452 // code. Masked vector code generation support will follow soon.
9453 // Also, do not attempt to vectorize if no vector code will be produced.
9454 if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF)
9455 return false;
9456
9457 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
9458
9459 {
9460 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
9461 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
9462 VF.Width, 1, &CM, BFI, PSI, Checks, BestPlan);
9463 LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \""
9464 << L->getHeader()->getParent()->getName() << "\"\n");
9465 LVP.executePlan(BestVF: VF.Width, BestUF: 1, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
9466 }
9467
9468 reportVectorization(ORE, TheLoop: L, VF, IC: 1);
9469
9470 // Mark the loop as already vectorized to avoid vectorizing again.
9471 Hints.setAlreadyVectorized();
9472 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9473 return true;
9474}
9475
9476// Emit a remark if there are stores to floats that required a floating point
9477// extension. If the vectorized loop was generated with floating point there
9478// will be a performance penalty from the conversion overhead and the change in
9479// the vector width.
9480static void checkMixedPrecision(Loop *L, OptimizationRemarkEmitter *ORE) {
9481 SmallVector<Instruction *, 4> Worklist;
9482 for (BasicBlock *BB : L->getBlocks()) {
9483 for (Instruction &Inst : *BB) {
9484 if (auto *S = dyn_cast<StoreInst>(Val: &Inst)) {
9485 if (S->getValueOperand()->getType()->isFloatTy())
9486 Worklist.push_back(Elt: S);
9487 }
9488 }
9489 }
9490
9491 // Traverse the floating point stores upwards searching, for floating point
9492 // conversions.
9493 SmallPtrSet<const Instruction *, 4> Visited;
9494 SmallPtrSet<const Instruction *, 4> EmittedRemark;
9495 while (!Worklist.empty()) {
9496 auto *I = Worklist.pop_back_val();
9497 if (!L->contains(Inst: I))
9498 continue;
9499 if (!Visited.insert(Ptr: I).second)
9500 continue;
9501
9502 // Emit a remark if the floating point store required a floating
9503 // point conversion.
9504 // TODO: More work could be done to identify the root cause such as a
9505 // constant or a function return type and point the user to it.
9506 if (isa<FPExtInst>(Val: I) && EmittedRemark.insert(Ptr: I).second)
9507 ORE->emit(RemarkBuilder: [&]() {
9508 return OptimizationRemarkAnalysis(LV_NAME, "VectorMixedPrecision",
9509 I->getDebugLoc(), L->getHeader())
9510 << "floating point conversion changes vector width. "
9511 << "Mixed floating point precision requires an up/down "
9512 << "cast that will negatively impact performance.";
9513 });
9514
9515 for (Use &Op : I->operands())
9516 if (auto *OpI = dyn_cast<Instruction>(Val&: Op))
9517 Worklist.push_back(Elt: OpI);
9518 }
9519}
9520
9521/// For loops with uncountable early exits, find the cost of doing work when
9522/// exiting the loop early, such as calculating the final exit values of
9523/// variables used outside the loop.
9524/// TODO: This is currently overly pessimistic because the loop may not take
9525/// the early exit, but better to keep this conservative for now. In future,
9526/// it might be possible to relax this by using branch probabilities.
9527static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
9528 VPlan &Plan, ElementCount VF) {
9529 InstructionCost Cost = 0;
9530 for (auto *ExitVPBB : Plan.getExitBlocks()) {
9531 for (auto *PredVPBB : ExitVPBB->getPredecessors()) {
9532 // If the predecessor is not the middle.block, then it must be the
9533 // vector.early.exit block, which may contain work to calculate the exit
9534 // values of variables used outside the loop.
9535 if (PredVPBB != Plan.getMiddleBlock()) {
9536 LLVM_DEBUG(dbgs() << "Calculating cost of work in exit block "
9537 << PredVPBB->getName() << ":\n");
9538 Cost += PredVPBB->cost(VF, Ctx&: CostCtx);
9539 }
9540 }
9541 }
9542 return Cost;
9543}
9544
9545/// This function determines whether or not it's still profitable to vectorize
9546/// the loop given the extra work we have to do outside of the loop:
9547/// 1. Perform the runtime checks before entering the loop to ensure it's safe
9548/// to vectorize.
9549/// 2. In the case of loops with uncountable early exits, we may have to do
9550/// extra work when exiting the loop early, such as calculating the final
9551/// exit values of variables used outside the loop.
9552static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
9553 VectorizationFactor &VF, Loop *L,
9554 PredicatedScalarEvolution &PSE,
9555 VPCostContext &CostCtx, VPlan &Plan,
9556 ScalarEpilogueLowering SEL,
9557 std::optional<unsigned> VScale) {
9558 InstructionCost TotalCost = Checks.getCost();
9559 if (!TotalCost.isValid())
9560 return false;
9561
9562 // Add on the cost of any work required in the vector early exit block, if
9563 // one exists.
9564 TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF: VF.Width);
9565
9566 // When interleaving only scalar and vector cost will be equal, which in turn
9567 // would lead to a divide by 0. Fall back to hard threshold.
9568 if (VF.Width.isScalar()) {
9569 // TODO: Should we rename VectorizeMemoryCheckThreshold?
9570 if (TotalCost > VectorizeMemoryCheckThreshold) {
9571 LLVM_DEBUG(
9572 dbgs()
9573 << "LV: Interleaving only is not profitable due to runtime checks\n");
9574 return false;
9575 }
9576 return true;
9577 }
9578
9579 // The scalar cost should only be 0 when vectorizing with a user specified
9580 // VF/IC. In those cases, runtime checks should always be generated.
9581 uint64_t ScalarC = VF.ScalarCost.getValue();
9582 if (ScalarC == 0)
9583 return true;
9584
9585 // First, compute the minimum iteration count required so that the vector
9586 // loop outperforms the scalar loop.
9587 // The total cost of the scalar loop is
9588 // ScalarC * TC
9589 // where
9590 // * TC is the actual trip count of the loop.
9591 // * ScalarC is the cost of a single scalar iteration.
9592 //
9593 // The total cost of the vector loop is
9594 // RtC + VecC * (TC / VF) + EpiC
9595 // where
9596 // * RtC is the cost of the generated runtime checks plus the cost of
9597 // performing any additional work in the vector.early.exit block for loops
9598 // with uncountable early exits.
9599 // * VecC is the cost of a single vector iteration.
9600 // * TC is the actual trip count of the loop
9601 // * VF is the vectorization factor
9602 // * EpiCost is the cost of the generated epilogue, including the cost
9603 // of the remaining scalar operations.
9604 //
9605 // Vectorization is profitable once the total vector cost is less than the
9606 // total scalar cost:
9607 // RtC + VecC * (TC / VF) + EpiC < ScalarC * TC
9608 //
9609 // Now we can compute the minimum required trip count TC as
9610 // VF * (RtC + EpiC) / (ScalarC * VF - VecC) < TC
9611 //
9612 // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that
9613 // the computations are performed on doubles, not integers and the result
9614 // is rounded up, hence we get an upper estimate of the TC.
9615 unsigned IntVF = getEstimatedRuntimeVF(VF: VF.Width, VScale);
9616 uint64_t RtC = TotalCost.getValue();
9617 uint64_t Div = ScalarC * IntVF - VF.Cost.getValue();
9618 uint64_t MinTC1 = Div == 0 ? 0 : divideCeil(Numerator: RtC * IntVF, Denominator: Div);
9619
9620 // Second, compute a minimum iteration count so that the cost of the
9621 // runtime checks is only a fraction of the total scalar loop cost. This
9622 // adds a loop-dependent bound on the overhead incurred if the runtime
9623 // checks fail. In case the runtime checks fail, the cost is RtC + ScalarC
9624 // * TC. To bound the runtime check to be a fraction 1/X of the scalar
9625 // cost, compute
9626 // RtC < ScalarC * TC * (1 / X) ==> RtC * X / ScalarC < TC
9627 uint64_t MinTC2 = divideCeil(Numerator: RtC * 10, Denominator: ScalarC);
9628
9629 // Now pick the larger minimum. If it is not a multiple of VF and a scalar
9630 // epilogue is allowed, choose the next closest multiple of VF. This should
9631 // partly compensate for ignoring the epilogue cost.
9632 uint64_t MinTC = std::max(a: MinTC1, b: MinTC2);
9633 if (SEL == CM_ScalarEpilogueAllowed)
9634 MinTC = alignTo(Value: MinTC, Align: IntVF);
9635 VF.MinProfitableTripCount = ElementCount::getFixed(MinVal: MinTC);
9636
9637 LLVM_DEBUG(
9638 dbgs() << "LV: Minimum required TC for runtime checks to be profitable:"
9639 << VF.MinProfitableTripCount << "\n");
9640
9641 // Skip vectorization if the expected trip count is less than the minimum
9642 // required trip count.
9643 if (auto ExpectedTC = getSmallBestKnownTC(PSE, L)) {
9644 if (ElementCount::isKnownLT(LHS: *ExpectedTC, RHS: VF.MinProfitableTripCount)) {
9645 LLVM_DEBUG(dbgs() << "LV: Vectorization is not beneficial: expected "
9646 "trip count < minimum profitable VF ("
9647 << *ExpectedTC << " < " << VF.MinProfitableTripCount
9648 << ")\n");
9649
9650 return false;
9651 }
9652 }
9653 return true;
9654}
9655
9656LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
9657 : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
9658 !EnableLoopInterleaving),
9659 VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
9660 !EnableLoopVectorization) {}
9661
9662/// Prepare \p MainPlan for vectorizing the main vector loop during epilogue
9663/// vectorization. Remove ResumePhis from \p MainPlan for inductions that
9664/// don't have a corresponding wide induction in \p EpiPlan.
9665static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
9666 // Collect PHI nodes of widened phis in the VPlan for the epilogue. Those
9667 // will need their resume-values computed in the main vector loop. Others
9668 // can be removed from the main VPlan.
9669 SmallPtrSet<PHINode *, 2> EpiWidenedPhis;
9670 for (VPRecipeBase &R :
9671 EpiPlan.getVectorLoopRegion()->getEntryBasicBlock()->phis()) {
9672 if (isa<VPCanonicalIVPHIRecipe>(Val: &R))
9673 continue;
9674 EpiWidenedPhis.insert(
9675 Ptr: cast<PHINode>(Val: R.getVPSingleValue()->getUnderlyingValue()));
9676 }
9677 for (VPRecipeBase &R :
9678 make_early_inc_range(Range: MainPlan.getScalarHeader()->phis())) {
9679 auto *VPIRInst = cast<VPIRPhi>(Val: &R);
9680 if (EpiWidenedPhis.contains(Ptr: &VPIRInst->getIRPhi()))
9681 continue;
9682 // There is no corresponding wide induction in the epilogue plan that would
9683 // need a resume value. Remove the VPIRInst wrapping the scalar header phi
9684 // together with the corresponding ResumePhi. The resume values for the
9685 // scalar loop will be created during execution of EpiPlan.
9686 VPRecipeBase *ResumePhi = VPIRInst->getOperand(N: 0)->getDefiningRecipe();
9687 VPIRInst->eraseFromParent();
9688 ResumePhi->eraseFromParent();
9689 }
9690 VPlanTransforms::runPass(Fn: VPlanTransforms::removeDeadRecipes, Plan&: MainPlan);
9691
9692 using namespace VPlanPatternMatch;
9693 // When vectorizing the epilogue, FindFirstIV & FindLastIV reductions can
9694 // introduce multiple uses of undef/poison. If the reduction start value may
9695 // be undef or poison it needs to be frozen and the frozen start has to be
9696 // used when computing the reduction result. We also need to use the frozen
9697 // value in the resume phi generated by the main vector loop, as this is also
9698 // used to compute the reduction result after the epilogue vector loop.
9699 auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
9700 bool UpdateResumePhis) {
9701 VPBuilder Builder(Plan.getEntry());
9702 for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
9703 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9704 if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindIVResult)
9705 continue;
9706 VPValue *OrigStart = VPI->getOperand(N: 1);
9707 if (isGuaranteedNotToBeUndefOrPoison(V: OrigStart->getLiveInIRValue()))
9708 continue;
9709 VPInstruction *Freeze =
9710 Builder.createNaryOp(Opcode: Instruction::Freeze, Operands: {OrigStart}, Inst: {}, Name: "fr");
9711 VPI->setOperand(I: 1, New: Freeze);
9712 if (UpdateResumePhis)
9713 OrigStart->replaceUsesWithIf(New: Freeze, ShouldReplace: [Freeze](VPUser &U, unsigned) {
9714 return Freeze != &U && isa<VPPhi>(Val: &U);
9715 });
9716 }
9717 };
9718 AddFreezeForFindLastIVReductions(MainPlan, true);
9719 AddFreezeForFindLastIVReductions(EpiPlan, false);
9720
9721 VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
9722 VPValue *VectorTC = &MainPlan.getVectorTripCount();
9723 // If there is a suitable resume value for the canonical induction in the
9724 // scalar (which will become vector) epilogue loop we are done. Otherwise
9725 // create it below.
9726 if (any_of(Range&: *MainScalarPH, P: [VectorTC](VPRecipeBase &R) {
9727 return match(V: &R, P: m_VPInstruction<Instruction::PHI>(Op0: m_Specific(VPV: VectorTC),
9728 Op1: m_SpecificInt(V: 0)));
9729 }))
9730 return;
9731 VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
9732 ScalarPHBuilder.createScalarPhi(
9733 IncomingValues: {VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, DL: {},
9734 Name: "vec.epilog.resume.val");
9735}
9736
9737/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
9738/// SCEVs from \p ExpandedSCEVs and set resume values for header recipes.
9739static void
9740preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
9741 const SCEV2ValueTy &ExpandedSCEVs,
9742 const EpilogueLoopVectorizationInfo &EPI) {
9743 VPRegionBlock *VectorLoop = Plan.getVectorLoopRegion();
9744 VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
9745 Header->setName("vec.epilog.vector.body");
9746
9747 DenseMap<Value *, Value *> ToFrozen;
9748 // Ensure that the start values for all header phi recipes are updated before
9749 // vectorizing the epilogue loop.
9750 for (VPRecipeBase &R : Header->phis()) {
9751 if (auto *IV = dyn_cast<VPCanonicalIVPHIRecipe>(Val: &R)) {
9752 // When vectorizing the epilogue loop, the canonical induction start
9753 // value needs to be changed from zero to the value after the main
9754 // vector loop. Find the resume value created during execution of the main
9755 // VPlan.
9756 // FIXME: Improve modeling for canonical IV start values in the epilogue
9757 // loop.
9758 using namespace llvm::PatternMatch;
9759 Type *IdxTy = IV->getScalarType();
9760 PHINode *EPResumeVal = find_singleton<PHINode>(
9761 Range: L->getLoopPreheader()->phis(),
9762 P: [&EPI, IdxTy](PHINode &P, bool) -> PHINode * {
9763 if (P.getType() == IdxTy &&
9764 match(
9765 V: P.getIncomingValueForBlock(BB: EPI.MainLoopIterationCountCheck),
9766 P: m_SpecificInt(V: 0)) &&
9767 all_of(Range: P.incoming_values(), P: [&EPI](Value *Inc) {
9768 return Inc == EPI.VectorTripCount ||
9769 match(V: Inc, P: m_SpecificInt(V: 0));
9770 }))
9771 return &P;
9772 return nullptr;
9773 });
9774 assert(EPResumeVal && "must have a resume value for the canonical IV");
9775 VPValue *VPV = Plan.getOrAddLiveIn(V: EPResumeVal);
9776 assert(all_of(IV->users(),
9777 [](const VPUser *U) {
9778 return isa<VPScalarIVStepsRecipe>(U) ||
9779 isa<VPDerivedIVRecipe>(U) ||
9780 cast<VPRecipeBase>(U)->isScalarCast() ||
9781 cast<VPInstruction>(U)->getOpcode() ==
9782 Instruction::Add;
9783 }) &&
9784 "the canonical IV should only be used by its increment or "
9785 "ScalarIVSteps when resetting the start value");
9786 IV->setOperand(I: 0, New: VPV);
9787 continue;
9788 }
9789
9790 Value *ResumeV = nullptr;
9791 // TODO: Move setting of resume values to prepareToExecute.
9792 if (auto *ReductionPhi = dyn_cast<VPReductionPHIRecipe>(Val: &R)) {
9793 auto *RdxResult =
9794 cast<VPInstruction>(Val: *find_if(Range: ReductionPhi->users(), P: [](VPUser *U) {
9795 auto *VPI = dyn_cast<VPInstruction>(Val: U);
9796 return VPI &&
9797 (VPI->getOpcode() == VPInstruction::ComputeAnyOfResult ||
9798 VPI->getOpcode() == VPInstruction::ComputeReductionResult ||
9799 VPI->getOpcode() == VPInstruction::ComputeFindIVResult);
9800 }));
9801 ResumeV = cast<PHINode>(Val: ReductionPhi->getUnderlyingInstr())
9802 ->getIncomingValueForBlock(BB: L->getLoopPreheader());
9803 RecurKind RK = ReductionPhi->getRecurrenceKind();
9804 if (RecurrenceDescriptor::isAnyOfRecurrenceKind(Kind: RK)) {
9805 Value *StartV = RdxResult->getOperand(N: 1)->getLiveInIRValue();
9806 // VPReductionPHIRecipes for AnyOf reductions expect a boolean as
9807 // start value; compare the final value from the main vector loop
9808 // to the start value.
9809 BasicBlock *PBB = cast<Instruction>(Val: ResumeV)->getParent();
9810 IRBuilder<> Builder(PBB, PBB->getFirstNonPHIIt());
9811 ResumeV = Builder.CreateICmpNE(LHS: ResumeV, RHS: StartV);
9812 } else if (RecurrenceDescriptor::isFindIVRecurrenceKind(Kind: RK)) {
9813 Value *StartV = getStartValueFromReductionResult(RdxResult);
9814 ToFrozen[StartV] = cast<PHINode>(Val: ResumeV)->getIncomingValueForBlock(
9815 BB: EPI.MainLoopIterationCountCheck);
9816
9817 // VPReductionPHIRecipe for FindFirstIV/FindLastIV reductions requires
9818 // an adjustment to the resume value. The resume value is adjusted to
9819 // the sentinel value when the final value from the main vector loop
9820 // equals the start value. This ensures correctness when the start value
9821 // might not be less than the minimum value of a monotonically
9822 // increasing induction variable.
9823 BasicBlock *ResumeBB = cast<Instruction>(Val: ResumeV)->getParent();
9824 IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
9825 Value *Cmp = Builder.CreateICmpEQ(LHS: ResumeV, RHS: ToFrozen[StartV]);
9826 Value *Sentinel = RdxResult->getOperand(N: 2)->getLiveInIRValue();
9827 ResumeV = Builder.CreateSelect(C: Cmp, True: Sentinel, False: ResumeV);
9828 } else {
9829 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9830 auto *PhiR = dyn_cast<VPReductionPHIRecipe>(Val: &R);
9831 if (auto *VPI = dyn_cast<VPInstruction>(Val: PhiR->getStartValue())) {
9832 assert(VPI->getOpcode() == VPInstruction::ReductionStartVector &&
9833 "unexpected start value");
9834 VPI->setOperand(I: 0, New: StartVal);
9835 continue;
9836 }
9837 }
9838 } else {
9839 // Retrieve the induction resume values for wide inductions from
9840 // their original phi nodes in the scalar loop.
9841 PHINode *IndPhi = cast<VPWidenInductionRecipe>(Val: &R)->getPHINode();
9842 // Hook up to the PHINode generated by a ResumePhi recipe of main
9843 // loop VPlan, which feeds the scalar loop.
9844 ResumeV = IndPhi->getIncomingValueForBlock(BB: L->getLoopPreheader());
9845 }
9846 assert(ResumeV && "Must have a resume value");
9847 VPValue *StartVal = Plan.getOrAddLiveIn(V: ResumeV);
9848 cast<VPHeaderPHIRecipe>(Val: &R)->setStartValue(StartVal);
9849 }
9850
9851 // For some VPValues in the epilogue plan we must re-use the generated IR
9852 // values from the main plan. Replace them with live-in VPValues.
9853 // TODO: This is a workaround needed for epilogue vectorization and it
9854 // should be removed once induction resume value creation is done
9855 // directly in VPlan.
9856 for (auto &R : make_early_inc_range(Range&: *Plan.getEntry())) {
9857 // Re-use frozen values from the main plan for Freeze VPInstructions in the
9858 // epilogue plan. This ensures all users use the same frozen value.
9859 auto *VPI = dyn_cast<VPInstruction>(Val: &R);
9860 if (VPI && VPI->getOpcode() == Instruction::Freeze) {
9861 VPI->replaceAllUsesWith(New: Plan.getOrAddLiveIn(
9862 V: ToFrozen.lookup(Val: VPI->getOperand(N: 0)->getLiveInIRValue())));
9863 continue;
9864 }
9865
9866 // Re-use the trip count and steps expanded for the main loop, as
9867 // skeleton creation needs it as a value that dominates both the scalar
9868 // and vector epilogue loops
9869 auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(Val: &R);
9870 if (!ExpandR)
9871 continue;
9872 VPValue *ExpandedVal =
9873 Plan.getOrAddLiveIn(V: ExpandedSCEVs.lookup(Val: ExpandR->getSCEV()));
9874 ExpandR->replaceAllUsesWith(New: ExpandedVal);
9875 if (Plan.getTripCount() == ExpandR)
9876 Plan.resetTripCount(NewTripCount: ExpandedVal);
9877 ExpandR->eraseFromParent();
9878 }
9879}
9880
9881// Generate bypass values from the additional bypass block. Note that when the
9882// vectorized epilogue is skipped due to iteration count check, then the
9883// resume value for the induction variable comes from the trip count of the
9884// main vector loop, passed as the second argument.
9885static Value *createInductionAdditionalBypassValues(
9886 PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
9887 const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
9888 Instruction *OldInduction) {
9889 Value *Step = getExpandedStep(ID: II, ExpandedSCEVs);
9890 // For the primary induction the additional bypass end value is known.
9891 // Otherwise it is computed.
9892 Value *EndValueFromAdditionalBypass = MainVectorTripCount;
9893 if (OrigPhi != OldInduction) {
9894 auto *BinOp = II.getInductionBinOp();
9895 // Fast-math-flags propagate from the original induction instruction.
9896 if (isa_and_nonnull<FPMathOperator>(Val: BinOp))
9897 BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
9898
9899 // Compute the end value for the additional bypass.
9900 EndValueFromAdditionalBypass =
9901 emitTransformedIndex(B&: BypassBuilder, Index: MainVectorTripCount,
9902 StartValue: II.getStartValue(), Step, InductionKind: II.getKind(), InductionBinOp: BinOp);
9903 EndValueFromAdditionalBypass->setName("ind.end");
9904 }
9905 return EndValueFromAdditionalBypass;
9906}
9907
9908bool LoopVectorizePass::processLoop(Loop *L) {
9909 assert((EnableVPlanNativePath || L->isInnermost()) &&
9910 "VPlan-native path is not enabled. Only process inner loops.");
9911
9912 LLVM_DEBUG(dbgs() << "\nLV: Checking a loop in '"
9913 << L->getHeader()->getParent()->getName() << "' from "
9914 << L->getLocStr() << "\n");
9915
9916 LoopVectorizeHints Hints(L, InterleaveOnlyWhenForced, *ORE, TTI);
9917
9918 LLVM_DEBUG(
9919 dbgs() << "LV: Loop hints:"
9920 << " force="
9921 << (Hints.getForce() == LoopVectorizeHints::FK_Disabled
9922 ? "disabled"
9923 : (Hints.getForce() == LoopVectorizeHints::FK_Enabled
9924 ? "enabled"
9925 : "?"))
9926 << " width=" << Hints.getWidth()
9927 << " interleave=" << Hints.getInterleave() << "\n");
9928
9929 // Function containing loop
9930 Function *F = L->getHeader()->getParent();
9931
9932 // Looking at the diagnostic output is the only way to determine if a loop
9933 // was vectorized (other than looking at the IR or machine code), so it
9934 // is important to generate an optimization remark for each loop. Most of
9935 // these messages are generated as OptimizationRemarkAnalysis. Remarks
9936 // generated as OptimizationRemark and OptimizationRemarkMissed are
9937 // less verbose reporting vectorized loops and unvectorized loops that may
9938 // benefit from vectorization, respectively.
9939
9940 if (!Hints.allowVectorization(F, L, VectorizeOnlyWhenForced)) {
9941 LLVM_DEBUG(dbgs() << "LV: Loop hints prevent vectorization.\n");
9942 return false;
9943 }
9944
9945 PredicatedScalarEvolution PSE(*SE, *L);
9946
9947 // Check if it is legal to vectorize the loop.
9948 LoopVectorizationRequirements Requirements;
9949 LoopVectorizationLegality LVL(L, PSE, DT, TTI, TLI, F, *LAIs, LI, ORE,
9950 &Requirements, &Hints, DB, AC, BFI, PSI);
9951 if (!LVL.canVectorize(UseVPlanNativePath: EnableVPlanNativePath)) {
9952 LLVM_DEBUG(dbgs() << "LV: Not vectorizing: Cannot prove legality.\n");
9953 Hints.emitRemarkWithHints();
9954 return false;
9955 }
9956
9957 if (LVL.hasUncountableEarlyExit() && !EnableEarlyExitVectorization) {
9958 reportVectorizationFailure(DebugMsg: "Auto-vectorization of loops with uncountable "
9959 "early exit is not enabled",
9960 ORETag: "UncountableEarlyExitLoopsDisabled", ORE, TheLoop: L);
9961 return false;
9962 }
9963
9964 // Entrance to the VPlan-native vectorization path. Outer loops are processed
9965 // here. They may require CFG and instruction level transformations before
9966 // even evaluating whether vectorization is profitable. Since we cannot modify
9967 // the incoming IR, we need to build VPlan upfront in the vectorization
9968 // pipeline.
9969 if (!L->isInnermost())
9970 return processLoopInVPlanNativePath(L, PSE, LI, DT, LVL: &LVL, TTI, TLI, DB, AC,
9971 ORE, BFI, PSI, Hints, Requirements);
9972
9973 assert(L->isInnermost() && "Inner loop expected.");
9974
9975 InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI());
9976 bool UseInterleaved = TTI->enableInterleavedAccessVectorization();
9977
9978 // If an override option has been passed in for interleaved accesses, use it.
9979 if (EnableInterleavedMemAccesses.getNumOccurrences() > 0)
9980 UseInterleaved = EnableInterleavedMemAccesses;
9981
9982 // Analyze interleaved memory accesses.
9983 if (UseInterleaved)
9984 IAI.analyzeInterleaving(EnableMaskedInterleavedGroup: useMaskedInterleavedAccesses(TTI: *TTI));
9985
9986 if (LVL.hasUncountableEarlyExit()) {
9987 BasicBlock *LoopLatch = L->getLoopLatch();
9988 if (IAI.requiresScalarEpilogue() ||
9989 any_of(Range: LVL.getCountableExitingBlocks(),
9990 P: [LoopLatch](BasicBlock *BB) { return BB != LoopLatch; })) {
9991 reportVectorizationFailure(DebugMsg: "Auto-vectorization of early exit loops "
9992 "requiring a scalar epilogue is unsupported",
9993 ORETag: "UncountableEarlyExitUnsupported", ORE, TheLoop: L);
9994 return false;
9995 }
9996 }
9997
9998 // Check the function attributes and profiles to find out if this function
9999 // should be optimized for size.
10000 ScalarEpilogueLowering SEL =
10001 getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, IAI: &IAI);
10002
10003 // Check the loop for a trip count threshold: vectorize loops with a tiny trip
10004 // count by optimizing for size, to minimize overheads.
10005 auto ExpectedTC = getSmallBestKnownTC(PSE, L);
10006 if (ExpectedTC && ExpectedTC->isFixed() &&
10007 ExpectedTC->getFixedValue() < TinyTripCountVectorThreshold) {
10008 LLVM_DEBUG(dbgs() << "LV: Found a loop with a very small trip count. "
10009 << "This loop is worth vectorizing only if no scalar "
10010 << "iteration overheads are incurred.");
10011 if (Hints.getForce() == LoopVectorizeHints::FK_Enabled)
10012 LLVM_DEBUG(dbgs() << " But vectorizing was explicitly forced.\n");
10013 else {
10014 LLVM_DEBUG(dbgs() << "\n");
10015 // Predicate tail-folded loops are efficient even when the loop
10016 // iteration count is low. However, setting the epilogue policy to
10017 // `CM_ScalarEpilogueNotAllowedLowTripLoop` prevents vectorizing loops
10018 // with runtime checks. It's more effective to let
10019 // `isOutsideLoopWorkProfitable` determine if vectorization is
10020 // beneficial for the loop.
10021 if (SEL != CM_ScalarEpilogueNotNeededUsePredicate)
10022 SEL = CM_ScalarEpilogueNotAllowedLowTripLoop;
10023 }
10024 }
10025
10026 // Check the function attributes to see if implicit floats or vectors are
10027 // allowed.
10028 if (F->hasFnAttribute(Kind: Attribute::NoImplicitFloat)) {
10029 reportVectorizationFailure(
10030 DebugMsg: "Can't vectorize when the NoImplicitFloat attribute is used",
10031 OREMsg: "loop not vectorized due to NoImplicitFloat attribute",
10032 ORETag: "NoImplicitFloat", ORE, TheLoop: L);
10033 Hints.emitRemarkWithHints();
10034 return false;
10035 }
10036
10037 // Check if the target supports potentially unsafe FP vectorization.
10038 // FIXME: Add a check for the type of safety issue (denormal, signaling)
10039 // for the target we're vectorizing for, to make sure none of the
10040 // additional fp-math flags can help.
10041 if (Hints.isPotentiallyUnsafe() &&
10042 TTI->isFPVectorizationPotentiallyUnsafe()) {
10043 reportVectorizationFailure(
10044 DebugMsg: "Potentially unsafe FP op prevents vectorization",
10045 OREMsg: "loop not vectorized due to unsafe FP support.",
10046 ORETag: "UnsafeFP", ORE, TheLoop: L);
10047 Hints.emitRemarkWithHints();
10048 return false;
10049 }
10050
10051 bool AllowOrderedReductions;
10052 // If the flag is set, use that instead and override the TTI behaviour.
10053 if (ForceOrderedReductions.getNumOccurrences() > 0)
10054 AllowOrderedReductions = ForceOrderedReductions;
10055 else
10056 AllowOrderedReductions = TTI->enableOrderedReductions();
10057 if (!LVL.canVectorizeFPMath(EnableStrictReductions: AllowOrderedReductions)) {
10058 ORE->emit(RemarkBuilder: [&]() {
10059 auto *ExactFPMathInst = Requirements.getExactFPInst();
10060 return OptimizationRemarkAnalysisFPCommute(DEBUG_TYPE, "CantReorderFPOps",
10061 ExactFPMathInst->getDebugLoc(),
10062 ExactFPMathInst->getParent())
10063 << "loop not vectorized: cannot prove it is safe to reorder "
10064 "floating-point operations";
10065 });
10066 LLVM_DEBUG(dbgs() << "LV: loop not vectorized: cannot prove it is safe to "
10067 "reorder floating-point operations\n");
10068 Hints.emitRemarkWithHints();
10069 return false;
10070 }
10071
10072 // Use the cost model.
10073 LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE,
10074 F, &Hints, IAI, PSI, BFI);
10075 // Use the planner for vectorization.
10076 LoopVectorizationPlanner LVP(L, LI, DT, TLI, *TTI, &LVL, CM, IAI, PSE, Hints,
10077 ORE);
10078
10079 // Get user vectorization factor and interleave count.
10080 ElementCount UserVF = Hints.getWidth();
10081 unsigned UserIC = Hints.getInterleave();
10082 if (LVL.hasUncountableEarlyExit() && UserIC != 1) {
10083 UserIC = 1;
10084 reportVectorizationInfo(Msg: "Interleaving not supported for loops "
10085 "with uncountable early exits",
10086 ORETag: "InterleaveEarlyExitDisabled", ORE, TheLoop: L);
10087 }
10088
10089 // Plan how to best vectorize.
10090 LVP.plan(UserVF, UserIC);
10091 VectorizationFactor VF = LVP.computeBestVF();
10092 unsigned IC = 1;
10093
10094 if (ORE->allowExtraAnalysis(LV_NAME))
10095 LVP.emitInvalidCostRemarks(ORE);
10096
10097 GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(), CM.CostKind);
10098 if (LVP.hasPlanWithVF(VF: VF.Width)) {
10099 // Select the interleave count.
10100 IC = CM.selectInterleaveCount(Plan&: LVP.getPlanFor(VF: VF.Width), VF: VF.Width, LoopCost: VF.Cost);
10101
10102 unsigned SelectedIC = std::max(a: IC, b: UserIC);
10103 // Optimistically generate runtime checks if they are needed. Drop them if
10104 // they turn out to not be profitable.
10105 if (VF.Width.isVector() || SelectedIC > 1)
10106 Checks.create(L, LAI: *LVL.getLAI(), UnionPred: PSE.getPredicate(), VF: VF.Width, IC: SelectedIC);
10107
10108 // Check if it is profitable to vectorize with runtime checks.
10109 bool ForceVectorization =
10110 Hints.getForce() == LoopVectorizeHints::FK_Enabled;
10111 VPCostContext CostCtx(CM.TTI, *CM.TLI, CM.Legal->getWidestInductionType(),
10112 CM, CM.CostKind);
10113 if (!ForceVectorization &&
10114 !isOutsideLoopWorkProfitable(Checks, VF, L, PSE, CostCtx,
10115 Plan&: LVP.getPlanFor(VF: VF.Width), SEL,
10116 VScale: CM.getVScaleForTuning())) {
10117 ORE->emit(RemarkBuilder: [&]() {
10118 return OptimizationRemarkAnalysisAliasing(
10119 DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(),
10120 L->getHeader())
10121 << "loop not vectorized: cannot prove it is safe to reorder "
10122 "memory operations";
10123 });
10124 LLVM_DEBUG(dbgs() << "LV: Too many memory checks needed.\n");
10125 Hints.emitRemarkWithHints();
10126 return false;
10127 }
10128 }
10129
10130 // Identify the diagnostic messages that should be produced.
10131 std::pair<StringRef, std::string> VecDiagMsg, IntDiagMsg;
10132 bool VectorizeLoop = true, InterleaveLoop = true;
10133 if (VF.Width.isScalar()) {
10134 LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n");
10135 VecDiagMsg = {
10136 "VectorizationNotBeneficial",
10137 "the cost-model indicates that vectorization is not beneficial"};
10138 VectorizeLoop = false;
10139 }
10140
10141 if (!LVP.hasPlanWithVF(VF: VF.Width) && UserIC > 1) {
10142 // Tell the user interleaving was avoided up-front, despite being explicitly
10143 // requested.
10144 LLVM_DEBUG(dbgs() << "LV: Ignoring UserIC, because vectorization and "
10145 "interleaving should be avoided up front\n");
10146 IntDiagMsg = {"InterleavingAvoided",
10147 "Ignoring UserIC, because interleaving was avoided up front"};
10148 InterleaveLoop = false;
10149 } else if (IC == 1 && UserIC <= 1) {
10150 // Tell the user interleaving is not beneficial.
10151 LLVM_DEBUG(dbgs() << "LV: Interleaving is not beneficial.\n");
10152 IntDiagMsg = {
10153 "InterleavingNotBeneficial",
10154 "the cost-model indicates that interleaving is not beneficial"};
10155 InterleaveLoop = false;
10156 if (UserIC == 1) {
10157 IntDiagMsg.first = "InterleavingNotBeneficialAndDisabled";
10158 IntDiagMsg.second +=
10159 " and is explicitly disabled or interleave count is set to 1";
10160 }
10161 } else if (IC > 1 && UserIC == 1) {
10162 // Tell the user interleaving is beneficial, but it explicitly disabled.
10163 LLVM_DEBUG(
10164 dbgs() << "LV: Interleaving is beneficial but is explicitly disabled.");
10165 IntDiagMsg = {"InterleavingBeneficialButDisabled",
10166 "the cost-model indicates that interleaving is beneficial "
10167 "but is explicitly disabled or interleave count is set to 1"};
10168 InterleaveLoop = false;
10169 }
10170
10171 // If there is a histogram in the loop, do not just interleave without
10172 // vectorizing. The order of operations will be incorrect without the
10173 // histogram intrinsics, which are only used for recipes with VF > 1.
10174 if (!VectorizeLoop && InterleaveLoop && LVL.hasHistograms()) {
10175 LLVM_DEBUG(dbgs() << "LV: Not interleaving without vectorization due "
10176 << "to histogram operations.\n");
10177 IntDiagMsg = {
10178 "HistogramPreventsScalarInterleaving",
10179 "Unable to interleave without vectorization due to constraints on "
10180 "the order of histogram operations"};
10181 InterleaveLoop = false;
10182 }
10183
10184 // Override IC if user provided an interleave count.
10185 IC = UserIC > 0 ? UserIC : IC;
10186
10187 // Emit diagnostic messages, if any.
10188 const char *VAPassName = Hints.vectorizeAnalysisPassName();
10189 if (!VectorizeLoop && !InterleaveLoop) {
10190 // Do not vectorize or interleaving the loop.
10191 ORE->emit(RemarkBuilder: [&]() {
10192 return OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
10193 L->getStartLoc(), L->getHeader())
10194 << VecDiagMsg.second;
10195 });
10196 ORE->emit(RemarkBuilder: [&]() {
10197 return OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
10198 L->getStartLoc(), L->getHeader())
10199 << IntDiagMsg.second;
10200 });
10201 return false;
10202 }
10203
10204 if (!VectorizeLoop && InterleaveLoop) {
10205 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10206 ORE->emit(RemarkBuilder: [&]() {
10207 return OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
10208 L->getStartLoc(), L->getHeader())
10209 << VecDiagMsg.second;
10210 });
10211 } else if (VectorizeLoop && !InterleaveLoop) {
10212 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10213 << ") in " << L->getLocStr() << '\n');
10214 ORE->emit(RemarkBuilder: [&]() {
10215 return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
10216 L->getStartLoc(), L->getHeader())
10217 << IntDiagMsg.second;
10218 });
10219 } else if (VectorizeLoop && InterleaveLoop) {
10220 LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width
10221 << ") in " << L->getLocStr() << '\n');
10222 LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n');
10223 }
10224
10225 bool DisableRuntimeUnroll = false;
10226 MDNode *OrigLoopID = L->getLoopID();
10227 {
10228 using namespace ore;
10229 if (!VectorizeLoop) {
10230 assert(IC > 1 && "interleave count should not be 1 or 0");
10231 // If we decided that it is not legal to vectorize the loop, then
10232 // interleave it.
10233 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
10234 InnerLoopVectorizer Unroller(
10235 L, PSE, LI, DT, TLI, TTI, AC, ORE, ElementCount::getFixed(MinVal: 1),
10236 ElementCount::getFixed(MinVal: 1), IC, &CM, BFI, PSI, Checks, BestPlan);
10237
10238 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: Unroller, DT, VectorizingEpilogue: false);
10239
10240 ORE->emit(RemarkBuilder: [&]() {
10241 return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(),
10242 L->getHeader())
10243 << "interleaved loop (interleaved count: "
10244 << NV("InterleaveCount", IC) << ")";
10245 });
10246 } else {
10247 // If we decided that it is *legal* to vectorize the loop, then do it.
10248
10249 VPlan &BestPlan = LVP.getPlanFor(VF: VF.Width);
10250 // Consider vectorizing the epilogue too if it's profitable.
10251 VectorizationFactor EpilogueVF =
10252 LVP.selectEpilogueVectorizationFactor(MainLoopVF: VF.Width, IC);
10253 if (EpilogueVF.Width.isVector()) {
10254 std::unique_ptr<VPlan> BestMainPlan(BestPlan.duplicate());
10255
10256 // The first pass vectorizes the main loop and creates a scalar epilogue
10257 // to be vectorized by executing the plan (potentially with a different
10258 // factor) again shortly afterwards.
10259 VPlan &BestEpiPlan = LVP.getPlanFor(VF: EpilogueVF.Width);
10260 BestEpiPlan.getMiddleBlock()->setName("vec.epilog.middle.block");
10261 preparePlanForMainVectorLoop(MainPlan&: *BestMainPlan, EpiPlan&: BestEpiPlan);
10262 EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1,
10263 BestEpiPlan);
10264 EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
10265 EPI, &CM, BFI, PSI, Checks,
10266 *BestMainPlan);
10267 auto ExpandedSCEVs = LVP.executePlan(BestVF: EPI.MainLoopVF, BestUF: EPI.MainLoopUF,
10268 BestVPlan&: *BestMainPlan, ILV&: MainILV, DT, VectorizingEpilogue: false);
10269 ++LoopsVectorized;
10270
10271 // Second pass vectorizes the epilogue and adjusts the control flow
10272 // edges from the first pass.
10273 EPI.MainLoopVF = EPI.EpilogueVF;
10274 EPI.MainLoopUF = EPI.EpilogueUF;
10275 EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
10276 ORE, EPI, &CM, BFI, PSI,
10277 Checks, BestEpiPlan);
10278 EpilogILV.setTripCount(MainILV.getTripCount());
10279 preparePlanForEpilogueVectorLoop(Plan&: BestEpiPlan, L, ExpandedSCEVs, EPI);
10280
10281 LVP.executePlan(BestVF: EPI.EpilogueVF, BestUF: EPI.EpilogueUF, BestVPlan&: BestEpiPlan, ILV&: EpilogILV,
10282 DT, VectorizingEpilogue: true);
10283
10284 // Fix induction resume values from the additional bypass block.
10285 BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
10286 IRBuilder<> BypassBuilder(BypassBlock,
10287 BypassBlock->getFirstInsertionPt());
10288 BasicBlock *PH = L->getLoopPreheader();
10289 for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
10290 auto *Inc = cast<PHINode>(Val: IVPhi->getIncomingValueForBlock(BB: PH));
10291 Value *V = createInductionAdditionalBypassValues(
10292 OrigPhi: IVPhi, II, BypassBuilder, ExpandedSCEVs, MainVectorTripCount: EPI.VectorTripCount,
10293 OldInduction: LVL.getPrimaryInduction());
10294 // TODO: Directly add as extra operand to the VPResumePHI recipe.
10295 Inc->setIncomingValueForBlock(BB: BypassBlock, V);
10296 }
10297 ++LoopsEpilogueVectorized;
10298
10299 if (!Checks.hasChecks())
10300 DisableRuntimeUnroll = true;
10301 } else {
10302 InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
10303 VF.MinProfitableTripCount, IC, &CM, BFI, PSI,
10304 Checks, BestPlan);
10305 LVP.executePlan(BestVF: VF.Width, BestUF: IC, BestVPlan&: BestPlan, ILV&: LB, DT, VectorizingEpilogue: false);
10306 ++LoopsVectorized;
10307
10308 // Add metadata to disable runtime unrolling a scalar loop when there
10309 // are no runtime checks about strides and memory. A scalar loop that is
10310 // rarely used is not worth unrolling.
10311 if (!Checks.hasChecks())
10312 DisableRuntimeUnroll = true;
10313 }
10314 // Report the vectorization decision.
10315 reportVectorization(ORE, TheLoop: L, VF, IC);
10316 }
10317
10318 if (ORE->allowExtraAnalysis(LV_NAME))
10319 checkMixedPrecision(L, ORE);
10320 }
10321
10322 assert(DT->verify(DominatorTree::VerificationLevel::Fast) &&
10323 "DT not preserved correctly");
10324
10325 std::optional<MDNode *> RemainderLoopID =
10326 makeFollowupLoopID(OrigLoopID, FollowupAttrs: {LLVMLoopVectorizeFollowupAll,
10327 LLVMLoopVectorizeFollowupEpilogue});
10328 if (RemainderLoopID) {
10329 L->setLoopID(*RemainderLoopID);
10330 } else {
10331 if (DisableRuntimeUnroll)
10332 addRuntimeUnrollDisableMetaData(L);
10333
10334 // Mark the loop as already vectorized to avoid vectorizing again.
10335 Hints.setAlreadyVectorized();
10336 }
10337
10338 assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
10339 return true;
10340}
10341
10342LoopVectorizeResult LoopVectorizePass::runImpl(Function &F) {
10343
10344 // Don't attempt if
10345 // 1. the target claims to have no vector registers, and
10346 // 2. interleaving won't help ILP.
10347 //
10348 // The second condition is necessary because, even if the target has no
10349 // vector registers, loop vectorization may still enable scalar
10350 // interleaving.
10351 if (!TTI->getNumberOfRegisters(ClassID: TTI->getRegisterClassForType(Vector: true)) &&
10352 TTI->getMaxInterleaveFactor(VF: ElementCount::getFixed(MinVal: 1)) < 2)
10353 return LoopVectorizeResult(false, false);
10354
10355 bool Changed = false, CFGChanged = false;
10356
10357 // The vectorizer requires loops to be in simplified form.
10358 // Since simplification may add new inner loops, it has to run before the
10359 // legality and profitability checks. This means running the loop vectorizer
10360 // will simplify all loops, regardless of whether anything end up being
10361 // vectorized.
10362 for (const auto &L : *LI)
10363 Changed |= CFGChanged |=
10364 simplifyLoop(L, DT, LI, SE, AC, MSSAU: nullptr, PreserveLCSSA: false /* PreserveLCSSA */);
10365
10366 // Build up a worklist of inner-loops to vectorize. This is necessary as
10367 // the act of vectorizing or partially unrolling a loop creates new loops
10368 // and can invalidate iterators across the loops.
10369 SmallVector<Loop *, 8> Worklist;
10370
10371 for (Loop *L : *LI)
10372 collectSupportedLoops(L&: *L, LI, ORE, V&: Worklist);
10373
10374 LoopsAnalyzed += Worklist.size();
10375
10376 // Now walk the identified inner loops.
10377 while (!Worklist.empty()) {
10378 Loop *L = Worklist.pop_back_val();
10379
10380 // For the inner loops we actually process, form LCSSA to simplify the
10381 // transform.
10382 Changed |= formLCSSARecursively(L&: *L, DT: *DT, LI, SE);
10383
10384 Changed |= CFGChanged |= processLoop(L);
10385
10386 if (Changed) {
10387 LAIs->clear();
10388
10389#ifndef NDEBUG
10390 if (VerifySCEV)
10391 SE->verify();
10392#endif
10393 }
10394 }
10395
10396 // Process each loop nest in the function.
10397 return LoopVectorizeResult(Changed, CFGChanged);
10398}
10399
10400PreservedAnalyses LoopVectorizePass::run(Function &F,
10401 FunctionAnalysisManager &AM) {
10402 LI = &AM.getResult<LoopAnalysis>(IR&: F);
10403 // There are no loops in the function. Return before computing other
10404 // expensive analyses.
10405 if (LI->empty())
10406 return PreservedAnalyses::all();
10407 SE = &AM.getResult<ScalarEvolutionAnalysis>(IR&: F);
10408 TTI = &AM.getResult<TargetIRAnalysis>(IR&: F);
10409 DT = &AM.getResult<DominatorTreeAnalysis>(IR&: F);
10410 TLI = &AM.getResult<TargetLibraryAnalysis>(IR&: F);
10411 AC = &AM.getResult<AssumptionAnalysis>(IR&: F);
10412 DB = &AM.getResult<DemandedBitsAnalysis>(IR&: F);
10413 ORE = &AM.getResult<OptimizationRemarkEmitterAnalysis>(IR&: F);
10414 LAIs = &AM.getResult<LoopAccessAnalysis>(IR&: F);
10415
10416 auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(IR&: F);
10417 PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(IR&: *F.getParent());
10418 BFI = nullptr;
10419 if (PSI && PSI->hasProfileSummary())
10420 BFI = &AM.getResult<BlockFrequencyAnalysis>(IR&: F);
10421 LoopVectorizeResult Result = runImpl(F);
10422 if (!Result.MadeAnyChange)
10423 return PreservedAnalyses::all();
10424 PreservedAnalyses PA;
10425
10426 if (isAssignmentTrackingEnabled(M: *F.getParent())) {
10427 for (auto &BB : F)
10428 RemoveRedundantDbgInstrs(BB: &BB);
10429 }
10430
10431 PA.preserve<LoopAnalysis>();
10432 PA.preserve<DominatorTreeAnalysis>();
10433 PA.preserve<ScalarEvolutionAnalysis>();
10434 PA.preserve<LoopAccessAnalysis>();
10435
10436 if (Result.MadeCFGChange) {
10437 // Making CFG changes likely means a loop got vectorized. Indicate that
10438 // extra simplification passes should be run.
10439 // TODO: MadeCFGChanges is not a prefect proxy. Extra passes should only
10440 // be run if runtime checks have been added.
10441 AM.getResult<ShouldRunExtraVectorPasses>(IR&: F);
10442 PA.preserve<ShouldRunExtraVectorPasses>();
10443 } else {
10444 PA.preserveSet<CFGAnalyses>();
10445 }
10446 return PA;
10447}
10448
10449void LoopVectorizePass::printPipeline(
10450 raw_ostream &OS, function_ref<StringRef(StringRef)> MapClassName2PassName) {
10451 static_cast<PassInfoMixin<LoopVectorizePass> *>(this)->printPipeline(
10452 OS, MapClassName2PassName);
10453
10454 OS << '<';
10455 OS << (InterleaveOnlyWhenForced ? "" : "no-") << "interleave-forced-only;";
10456 OS << (VectorizeOnlyWhenForced ? "" : "no-") << "vectorize-forced-only;";
10457 OS << '>';
10458}
10459

// End of llvm/lib/Transforms/Vectorize/LoopVectorize.cpp