LoopIdiomRecognize.cpp source code [llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp]

1	//===- LoopIdiomRecognize.cpp - Loop idiom recognition --------------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// This pass implements an idiom recognizer that transforms simple loops into a
10	// non-loop form. In cases that this kicks in, it can be a significant
11	// performance win.
12	//
13	// If compiling for code size we avoid idiom recognition if the resulting
14	// code could be larger than the code for the original loop. One way this could
15	// happen is if the loop is not removable after idiom recognition due to the
16	// presence of non-idiom instructions. The initial implementation of the
17	// heuristics applies to idioms in multi-block loops.
18	//
19	//===----------------------------------------------------------------------===//
20	//
21	// TODO List:
22	//
23	// Future loop memory idioms to recognize: memcmp, etc.
24	//
25	// This could recognize common matrix multiplies and dot product idioms and
26	// replace them with calls to BLAS (if linked in??).
27	//
28	//===----------------------------------------------------------------------===//
29
30	#include "llvm/Transforms/Scalar/LoopIdiomRecognize.h"
31	#include "llvm/ADT/APInt.h"
32	#include "llvm/ADT/ArrayRef.h"
33	#include "llvm/ADT/DenseMap.h"
34	#include "llvm/ADT/MapVector.h"
35	#include "llvm/ADT/SetVector.h"
36	#include "llvm/ADT/SmallPtrSet.h"
37	#include "llvm/ADT/SmallVector.h"
38	#include "llvm/ADT/Statistic.h"
39	#include "llvm/ADT/StringRef.h"
40	#include "llvm/Analysis/AliasAnalysis.h"
41	#include "llvm/Analysis/CmpInstAnalysis.h"
42	#include "llvm/Analysis/LoopInfo.h"
43	#include "llvm/Analysis/LoopPass.h"
44	#include "llvm/Analysis/MemoryLocation.h"
45	#include "llvm/Analysis/MemorySSA.h"
46	#include "llvm/Analysis/MemorySSAUpdater.h"
47	#include "llvm/Analysis/MustExecute.h"
48	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
49	#include "llvm/Analysis/ScalarEvolution.h"
50	#include "llvm/Analysis/ScalarEvolutionExpressions.h"
51	#include "llvm/Analysis/ScalarEvolutionPatternMatch.h"
52	#include "llvm/Analysis/TargetLibraryInfo.h"
53	#include "llvm/Analysis/TargetTransformInfo.h"
54	#include "llvm/Analysis/ValueTracking.h"
55	#include "llvm/IR/BasicBlock.h"
56	#include "llvm/IR/Constant.h"
57	#include "llvm/IR/Constants.h"
58	#include "llvm/IR/DataLayout.h"
59	#include "llvm/IR/DebugLoc.h"
60	#include "llvm/IR/DerivedTypes.h"
61	#include "llvm/IR/Dominators.h"
62	#include "llvm/IR/GlobalValue.h"
63	#include "llvm/IR/GlobalVariable.h"
64	#include "llvm/IR/IRBuilder.h"
65	#include "llvm/IR/InstrTypes.h"
66	#include "llvm/IR/Instruction.h"
67	#include "llvm/IR/Instructions.h"
68	#include "llvm/IR/IntrinsicInst.h"
69	#include "llvm/IR/Intrinsics.h"
70	#include "llvm/IR/LLVMContext.h"
71	#include "llvm/IR/Module.h"
72	#include "llvm/IR/PassManager.h"
73	#include "llvm/IR/PatternMatch.h"
74	#include "llvm/IR/Type.h"
75	#include "llvm/IR/User.h"
76	#include "llvm/IR/Value.h"
77	#include "llvm/IR/ValueHandle.h"
78	#include "llvm/Support/Casting.h"
79	#include "llvm/Support/CommandLine.h"
80	#include "llvm/Support/Debug.h"
81	#include "llvm/Support/InstructionCost.h"
82	#include "llvm/Support/raw_ostream.h"
83	#include "llvm/Transforms/Utils/BuildLibCalls.h"
84	#include "llvm/Transforms/Utils/Local.h"
85	#include "llvm/Transforms/Utils/LoopUtils.h"
86	#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
87	#include <algorithm>
88	#include <cassert>
89	#include <cstdint>
90	#include <utility>
91	#include <vector>
92
93	using namespace llvm;
94	using namespace SCEVPatternMatch;
95
96	#define DEBUG_TYPE "loop-idiom"
97
98	STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
99	STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
100	STATISTIC(NumMemMove, "Number of memmove's formed from loop load+stores");
101	STATISTIC(NumStrLen, "Number of strlen's and wcslen's formed from loop loads");
102	STATISTIC(
103	NumShiftUntilBitTest,
104	"Number of uncountable loops recognized as 'shift until bitttest' idiom");
105	STATISTIC(NumShiftUntilZero,
106	"Number of uncountable loops recognized as 'shift until zero' idiom");
107
108	bool DisableLIRP::All;
109	static cl::opt<bool, true>
110	DisableLIRPAll("disable-" DEBUG_TYPE "-all",
111	cl::desc ("Options to disable Loop Idiom Recognize Pass."),
112	cl::location(L&: DisableLIRP::All), cl::init(Val: false),
113	cl::ReallyHidden);
114
115	bool DisableLIRP::Memset;
116	static cl::opt<bool, true>
117	DisableLIRPMemset("disable-" DEBUG_TYPE "-memset",
118	cl::desc ("Proceed with loop idiom recognize pass, but do "
119	"not convert loop(s) to memset."),
120	cl::location(L&: DisableLIRP::Memset), cl::init(Val: false),
121	cl::ReallyHidden);
122
123	bool DisableLIRP::Memcpy;
124	static cl::opt<bool, true>
125	DisableLIRPMemcpy("disable-" DEBUG_TYPE "-memcpy",
126	cl::desc ("Proceed with loop idiom recognize pass, but do "
127	"not convert loop(s) to memcpy."),
128	cl::location(L&: DisableLIRP::Memcpy), cl::init(Val: false),
129	cl::ReallyHidden);
130
131	bool DisableLIRP::Strlen;
132	static cl::opt<bool, true>
133	DisableLIRPStrlen("disable-loop-idiom-strlen",
134	cl::desc ("Proceed with loop idiom recognize pass, but do "
135	"not convert loop(s) to strlen."),
136	cl::location(L&: DisableLIRP::Strlen), cl::init(Val: false),
137	cl::ReallyHidden);
138
139	bool DisableLIRP::Wcslen;
140	static cl::opt<bool, true>
141	EnableLIRPWcslen("disable-loop-idiom-wcslen",
142	cl::desc ("Proceed with loop idiom recognize pass, "
143	"enable conversion of loop(s) to wcslen."),
144	cl::location(L&: DisableLIRP::Wcslen), cl::init(Val: false),
145	cl::ReallyHidden);
146
147	static cl::opt<bool> UseLIRCodeSizeHeurs(
148	"use-lir-code-size-heurs",
149	cl::desc ("Use loop idiom recognition code size heuristics when compiling "
150	"with -Os/-Oz"),
151	cl::init(Val: true), cl::Hidden);
152
153	static cl::opt<bool> ForceMemsetPatternIntrinsic(
154	"loop-idiom-force-memset-pattern-intrinsic",
155	cl::desc ("Use memset.pattern intrinsic whenever possible"), cl::init(Val: false),
156	cl::Hidden);
157
158	namespace {
159
160	class LoopIdiomRecognize {
161	Loop CurLoop = nullptr*;
162	AliasAnalysis *AA;
163	DominatorTree *DT;
164	LoopInfo *LI;
165	ScalarEvolution *SE;
166	TargetLibraryInfo *TLI;
167	const TargetTransformInfo *TTI;
168	const DataLayout *DL;
169	OptimizationRemarkEmitter &ORE;
170	bool ApplyCodeSizeHeuristics;
171	std::unique_ptr<MemorySSAUpdater> MSSAU;
172
173	public:
174	explicit LoopIdiomRecognize(AliasAnalysis AA, DominatorTree DT,
175	LoopInfo LI, ScalarEvolution SE,
176	TargetLibraryInfo *TLI,
177	const TargetTransformInfo TTI, MemorySSA MSSA,
178	const DataLayout *DL,
179	OptimizationRemarkEmitter &ORE)
180	: AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
181	if (MSSA)
182	MSSAU = std::make_unique<MemorySSAUpdater>(args&: MSSA);
183	}
184
185	bool runOnLoop(Loop *L);
186
187	private:
188	using StoreList = SmallVector<StoreInst *, `8`>;
189	using StoreListMap = MapVector<Value *, StoreList>;
190
191	StoreListMap StoreRefsForMemset;
192	StoreListMap StoreRefsForMemsetPattern;
193	StoreList StoreRefsForMemcpy;
194	bool HasMemset;
195	bool HasMemsetPattern;
196	bool HasMemcpy;
197
198	/// Return code for isLegalStore()
199	enum LegalStoreKind {
200	None = `0`,
201	Memset,
202	MemsetPattern,
203	Memcpy,
204	UnorderedAtomicMemcpy,
205	DontUse // Dummy retval never to be used. Allows catching errors in retval
206	// handling.
207	};
208
209	/// \name Countable Loop Idiom Handling
210	/// @{
211
212	bool runOnCountableLoop();
213	bool runOnLoopBlock(BasicBlock BB, const* SCEV *BECount,
214	SmallVectorImpl<BasicBlock *> &ExitBlocks);
215
216	void collectStores(BasicBlock *BB);
217	LegalStoreKind isLegalStore(StoreInst *SI);
218	enum class ForMemset { No, Yes };
219	bool processLoopStores(SmallVectorImpl<StoreInst > &SL, const* SCEV *BECount,
220	ForMemset For);
221
222	template <typename MemInst>
223	bool processLoopMemIntrinsic(
224	BasicBlock *BB,
225	bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
226	const SCEV *BECount);
227	bool processLoopMemCpy(MemCpyInst MCI, const* SCEV *BECount);
228	bool processLoopMemSet(MemSetInst MSI, const* SCEV *BECount);
229
230	bool processLoopStridedStore(Value DestPtr, const* SCEV *StoreSizeSCEV,
231	MaybeAlign StoreAlignment, Value *StoredVal,
232	Instruction *TheStore,
233	SmallPtrSetImpl<Instruction *> &Stores,
234	const SCEVAddRecExpr Ev, const* SCEV *BECount,
235	bool IsNegStride, bool IsLoopMemset = false);
236	bool processLoopStoreOfLoopLoad(StoreInst SI, const* SCEV *BECount);
237	bool processLoopStoreOfLoopLoad(Value DestPtr, Value SourcePtr,
238	const SCEV *StoreSize, MaybeAlign StoreAlign,
239	MaybeAlign LoadAlign, Instruction *TheStore,
240	Instruction *TheLoad,
241	const SCEVAddRecExpr *StoreEv,
242	const SCEVAddRecExpr *LoadEv,
243	const SCEV *BECount);
244	bool avoidLIRForMultiBlockLoop(bool IsMemset = false,
245	bool IsLoopMemset = false);
246
247	/// @}
248	/// \name Noncountable Loop Idiom Handling
249	/// @{
250
251	bool runOnNoncountableLoop();
252
253	bool recognizePopcount();
254	void transformLoopToPopcount(BasicBlock PreCondBB, Instruction CntInst,
255	PHINode CntPhi, Value Var);
256	bool isProfitableToInsertFFS(Intrinsic::ID IntrinID, Value *InitX,
257	bool ZeroCheck, size_t CanonicalSize);
258	bool insertFFSIfProfitable(Intrinsic::ID IntrinID, Value *InitX,
259	Instruction DefX, PHINode CntPhi,
260	Instruction *CntInst);
261	bool recognizeAndInsertFFS(); /// Find First Set: ctlz or cttz
262	bool recognizeShiftUntilLessThan();
263	void transformLoopToCountable(Intrinsic::ID IntrinID, BasicBlock *PreCondBB,
264	Instruction CntInst, PHINode CntPhi,
265	Value Var, Instruction DefX,
266	const DebugLoc &DL, bool ZeroCheck,
267	bool IsCntPhiUsedOutsideLoop,
268	bool InsertSub = false);
269
270	bool recognizeShiftUntilBitTest();
271	bool recognizeShiftUntilZero();
272	bool recognizeAndInsertStrLen();
273
274	/// @}
275	};
276	} // end anonymous namespace
277
278	PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
279	LoopStandardAnalysisResults &AR,
280	LPMUpdater &) {
281	if (DisableLIRP::All)
282	return PreservedAnalyses::all();
283
284	const auto *DL = &L.getHeader()->getDataLayout();
285
286	// For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
287	// pass. Function analyses need to be preserved across loop transformations
288	// but ORE cannot be preserved (see comment before the pass definition).
289	OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
290
291	LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
292	AR.MSSA, DL, ORE);
293	if (!LIR.runOnLoop(L: &L))
294	return PreservedAnalyses::all();
295
296	auto PA = getLoopPassPreservedAnalyses();
297	if (AR.MSSA)
298	PA.preserve<MemorySSAAnalysis>();
299	return PA;
300	}
301
302	static void deleteDeadInstruction(Instruction *I) {
303	I->replaceAllUsesWith(V: PoisonValue::get(T: I->getType()));
304	I->eraseFromParent();
305	}
306
307	//===----------------------------------------------------------------------===//
308	//
309	// Implementation of LoopIdiomRecognize
310	//
311	//===----------------------------------------------------------------------===//
312
313	bool LoopIdiomRecognize::runOnLoop(Loop *L) {
314	CurLoop = L;
315	// If the loop could not be converted to canonical form, it must have an
316	// indirectbr in it, just give up.
317	if (!L->getLoopPreheader())
318	return false;
319
320	// Disable loop idiom recognition if the function's name is a common idiom.
321	StringRef Name = L->getHeader()->getParent()->getName();
322	if (Name == "memset" \|\| Name == "memcpy" \|\| Name == "strlen" \|\|
323	Name == "wcslen")
324	return false;
325
326	// Determine if code size heuristics need to be applied.
327	ApplyCodeSizeHeuristics =
328	L->getHeader()->getParent()->hasOptSize() && UseLIRCodeSizeHeurs;
329
330	HasMemset = TLI->has(F: LibFunc_memset);
331	// TODO: Unconditionally enable use of the memset pattern intrinsic (or at
332	// least, opt-in via target hook) once we are confident it will never result
333	// in worse codegen than without. For now, use it only when the target
334	// supports memset_pattern16 libcall (or unless this is overridden by
335	// command line option).
336	HasMemsetPattern = TLI->has(F: LibFunc_memset_pattern16);
337	HasMemcpy = TLI->has(F: LibFunc_memcpy);
338
339	if (HasMemset \|\| HasMemsetPattern \|\| ForceMemsetPatternIntrinsic \|\| HasMemcpy)
340	if (SE->hasLoopInvariantBackedgeTakenCount(L))
341	return runOnCountableLoop();
342
343	return runOnNoncountableLoop();
344	}
345
346	bool LoopIdiomRecognize::runOnCountableLoop() {
347	const SCEV *BECount = SE->getBackedgeTakenCount(L: CurLoop);
348	assert(!isa<SCEVCouldNotCompute>(BECount) &&
349	"runOnCountableLoop() called on a loop without a predictable"
350	"backedge-taken count");
351
352	// If this loop executes exactly one time, then it should be peeled, not
353	// optimized by this pass.
354	if (BECount->isZero())
355	return false;
356
357	SmallVector<BasicBlock *, `8`> ExitBlocks;
358	CurLoop->getUniqueExitBlocks(ExitBlocks);
359
360	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
361	<< CurLoop->getHeader()->getParent()->getName()
362	<< "] Countable Loop %" << CurLoop->getHeader()->getName()
363	<< "\n");
364
365	// The following transforms hoist stores/memsets into the loop pre-header.
366	// Give up if the loop has instructions that may throw.
367	SimpleLoopSafetyInfo SafetyInfo;
368	SafetyInfo.computeLoopSafetyInfo(CurLoop);
369	if (SafetyInfo.anyBlockMayThrow())
370	return false;
371
372	bool MadeChange = false;
373
374	// Scan all the blocks in the loop that are not in subloops.
375	for (auto *BB : CurLoop->getBlocks()) {
376	// Ignore blocks in subloops.
377	if (LI->getLoopFor(BB) != CurLoop)
378	continue;
379
380	MadeChange \|= runOnLoopBlock(BB, BECount, ExitBlocks);
381	}
382	return MadeChange;
383	}
384
385	static APInt getStoreStride(const SCEVAddRecExpr *StoreEv) {
386	const SCEVConstant *ConstStride = cast<SCEVConstant>(Val: StoreEv->getOperand(i: `1`));
387	return ConstStride->getAPInt();
388	}
389
390	/// getMemSetPatternValue - If a strided store of the specified value is safe to
391	/// turn into a memset.patternn intrinsic, return the Constant that should
392	/// be passed in. Otherwise, return null.
393	///
394	/// TODO this function could allow more constants than it does today (e.g.
395	/// those over 16 bytes) now it has transitioned to being used for the
396	/// memset.pattern intrinsic rather than directly the memset_pattern16
397	/// libcall.
398	static Constant getMemSetPatternValue(Value V, const DataLayout *DL) {
399	// FIXME: This could check for UndefValue because it can be merged into any
400	// other valid pattern.
401
402	// If the value isn't a constant, we can't promote it to being in a constant
403	// array. We could theoretically do a store to an alloca or something, but
404	// that doesn't seem worthwhile.
405	Constant *C = dyn_cast<Constant>(Val: V);
406	if (!C \|\| isa<ConstantExpr>(Val: C))
407	return nullptr;
408
409	// Only handle simple values that are a power of two bytes in size.
410	uint64_t Size = DL->getTypeSizeInBits(Ty: V->getType());
411	if (Size == `0` \|\| (Size & `7`) \|\| (Size & (Size - `1`)))
412	return nullptr;
413
414	// Don't care enough about darwin/ppc to implement this.
415	if (DL->isBigEndian())
416	return nullptr;
417
418	// Convert to size in bytes.
419	Size /= `8`;
420
421	// TODO: If CI is larger than 16-bytes, we can try slicing it in half to see
422	// if the top and bottom are the same (e.g. for vectors and large integers).
423	if (Size > `16`)
424	return nullptr;
425
426	// For now, don't handle types that aren't int, floats, or pointers.
427	Type *CTy = C->getType();
428	if (!CTy->isIntOrPtrTy() && !CTy->isFloatingPointTy())
429	return nullptr;
430
431	return C;
432	}
433
434	LoopIdiomRecognize::LegalStoreKind
435	LoopIdiomRecognize::isLegalStore(StoreInst *SI) {
436	// Don't touch volatile stores.
437	if (SI->isVolatile())
438	return LegalStoreKind::None;
439	// We only want simple or unordered-atomic stores.
440	if (!SI->isUnordered())
441	return LegalStoreKind::None;
442
443	// Avoid merging nontemporal stores.
444	if (SI->getMetadata(KindID: LLVMContext::MD_nontemporal))
445	return LegalStoreKind::None;
446
447	Value *StoredVal = SI->getValueOperand();
448	Value *StorePtr = SI->getPointerOperand();
449
450	// Don't convert stores of non-integral pointer types to memsets (which stores
451	// integers).
452	if (DL->isNonIntegralPointerType(Ty: StoredVal->getType()->getScalarType()))
453	return LegalStoreKind::None;
454
455	// Reject stores that are so large that they overflow an unsigned.
456	// When storing out scalable vectors we bail out for now, since the code
457	// below currently only works for constant strides.
458	TypeSize SizeInBits = DL->getTypeSizeInBits(Ty: StoredVal->getType());
459	if (SizeInBits.isScalable() \|\| (SizeInBits.getFixedValue() & `7`) \|\|
460	(SizeInBits.getFixedValue() >> `32`) != `0`)
461	return LegalStoreKind::None;
462
463	// See if the pointer expression is an AddRec like {base,+,1} on the current
464	// loop, which indicates a strided store. If we have something else, it's a
465	// random store we can't handle.
466	const SCEV *StoreEv = SE->getSCEV(V: StorePtr);
467	const SCEVConstant *Stride;
468	if (!match(S: StoreEv, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEVConstant(V&: Stride),
469	L: m_SpecificLoop(L: CurLoop))))
470	return LegalStoreKind::None;
471
472	// See if the store can be turned into a memset.
473
474	// If the stored value is a byte-wise value (like i32 -1), then it may be
475	// turned into a memset of i8 -1, assuming that all the consecutive bytes
476	// are stored. A store of i32 0x01020304 can never be turned into a memset,
477	// but it can be turned into memset_pattern if the target supports it.
478	Value SplatValue = isBytewiseValue(V: StoredVal, DL: DL);
479
480	// Note: memset and memset_pattern on unordered-atomic is yet not supported
481	bool UnorderedAtomic = SI->isUnordered() && !SI->isSimple();
482
483	// If we're allowed to form a memset, and the stored value would be
484	// acceptable for memset, use it.
485	if (!UnorderedAtomic && HasMemset && SplatValue && !DisableLIRP::Memset &&
486	// Verify that the stored value is loop invariant. If not, we can't
487	// promote the memset.
488	CurLoop->isLoopInvariant(V: SplatValue)) {
489	// It looks like we can use SplatValue.
490	return LegalStoreKind::Memset;
491	}
492	if (!UnorderedAtomic && (HasMemsetPattern \|\| ForceMemsetPatternIntrinsic) &&
493	!DisableLIRP::Memset &&
494	// Don't create memset_pattern16s with address spaces.
495	StorePtr->getType()->getPointerAddressSpace() == `0` &&
496	getMemSetPatternValue(V: StoredVal, DL)) {
497	// It looks like we can use PatternValue!
498	return LegalStoreKind::MemsetPattern;
499	}
500
501	// Otherwise, see if the store can be turned into a memcpy.
502	if (HasMemcpy && !DisableLIRP::Memcpy) {
503	// Check to see if the stride matches the size of the store. If so, then we
504	// know that every byte is touched in the loop.
505	unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
506	APInt StrideAP = Stride->getAPInt();
507	if (StoreSize != StrideAP && StoreSize != -StrideAP)
508	return LegalStoreKind::None;
509
510	// The store must be feeding a non-volatile load.
511	LoadInst *LI = dyn_cast<LoadInst>(Val: SI->getValueOperand());
512
513	// Only allow non-volatile loads
514	if (!LI \|\| LI->isVolatile())
515	return LegalStoreKind::None;
516	// Only allow simple or unordered-atomic loads
517	if (!LI->isUnordered())
518	return LegalStoreKind::None;
519
520	// See if the pointer expression is an AddRec like {base,+,1} on the current
521	// loop, which indicates a strided load. If we have something else, it's a
522	// random load we can't handle.
523	const SCEV *LoadEv = SE->getSCEV(V: LI->getPointerOperand());
524
525	// The store and load must share the same stride.
526	if (!match(S: LoadEv, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_scev_Specific(S: Stride),
527	L: m_SpecificLoop(L: CurLoop))))
528	return LegalStoreKind::None;
529
530	// Success. This store can be converted into a memcpy.
531	UnorderedAtomic = UnorderedAtomic \|\| LI->isAtomic();
532	return UnorderedAtomic ? LegalStoreKind::UnorderedAtomicMemcpy
533	: LegalStoreKind::Memcpy;
534	}
535	// This store can't be transformed into a memset/memcpy.
536	return LegalStoreKind::None;
537	}
538
539	void LoopIdiomRecognize::collectStores(BasicBlock *BB) {
540	StoreRefsForMemset.clear();
541	StoreRefsForMemsetPattern.clear();
542	StoreRefsForMemcpy.clear();
543	for (Instruction &I : *BB) {
544	StoreInst *SI = dyn_cast<StoreInst>(Val: &I);
545	if (!SI)
546	continue;
547
548	// Make sure this is a strided store with a constant stride.
549	switch (isLegalStore(SI)) {
550	case LegalStoreKind::None:
551	// Nothing to do
552	break;
553	case LegalStoreKind::Memset: {
554	// Find the base pointer.
555	Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
556	StoreRefsForMemset [Ptr].push_back(Elt: SI);
557	} break;
558	case LegalStoreKind::MemsetPattern: {
559	// Find the base pointer.
560	Value *Ptr = getUnderlyingObject(V: SI->getPointerOperand());
561	StoreRefsForMemsetPattern [Ptr].push_back(Elt: SI);
562	} break;
563	case LegalStoreKind::Memcpy:
564	case LegalStoreKind::UnorderedAtomicMemcpy:
565	StoreRefsForMemcpy.push_back(Elt: SI);
566	break;
567	default:
568	assert(false && "unhandled return value");
569	break;
570	}
571	}
572	}
573
574	/// runOnLoopBlock - Process the specified block, which lives in a counted loop
575	/// with the specified backedge count. This block is known to be in the current
576	/// loop and not in any subloops.
577	bool LoopIdiomRecognize::runOnLoopBlock(
578	BasicBlock BB, const* SCEV *BECount,
579	SmallVectorImpl<BasicBlock *> &ExitBlocks) {
580	// We can only promote stores in this block if they are unconditionally
581	// executed in the loop. For a block to be unconditionally executed, it has
582	// to dominate all the exit blocks of the loop. Verify this now.
583	for (BasicBlock *ExitBlock : ExitBlocks)
584	if (!DT->dominates(A: BB, B: ExitBlock))
585	return false;
586
587	bool MadeChange = false;
588	// Look for store instructions, which may be optimized to memset/memcpy.
589	collectStores(BB);
590
591	// Look for a single store or sets of stores with a common base, which can be
592	// optimized into a memset (memset_pattern). The latter most commonly happens
593	// with structs and handunrolled loops.
594	for (auto &SL : StoreRefsForMemset)
595	MadeChange \|= processLoopStores(SL&: SL.second, BECount, For: ForMemset::Yes);
596
597	for (auto &SL : StoreRefsForMemsetPattern)
598	MadeChange \|= processLoopStores(SL&: SL.second, BECount, For: ForMemset::No);
599
600	// Optimize the store into a memcpy, if it feeds an similarly strided load.
601	for (auto &SI : StoreRefsForMemcpy)
602	MadeChange \|= processLoopStoreOfLoopLoad(SI, BECount);
603
604	MadeChange \|= processLoopMemIntrinsic<MemCpyInst>(
605	BB, Processor: &LoopIdiomRecognize::processLoopMemCpy, BECount);
606	MadeChange \|= processLoopMemIntrinsic<MemSetInst>(
607	BB, Processor: &LoopIdiomRecognize::processLoopMemSet, BECount);
608
609	return MadeChange;
610	}
611
612	/// See if this store(s) can be promoted to a memset.
613	bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
614	const SCEV *BECount, ForMemset For) {
615	// Try to find consecutive stores that can be transformed into memsets.
616	SetVector<StoreInst *> Heads, Tails;
617	SmallDenseMap<StoreInst , StoreInst > ConsecutiveChain;
618
619	// Do a quadratic search on all of the given stores and find
620	// all of the pairs of stores that follow each other.
621	SmallVector<unsigned, `16`> IndexQueue;
622	for (unsigned i = `0`, e = SL.size(); i < e; ++i) {
623	assert(SL[i]->isSimple() && "Expected only non-volatile stores.");
624
625	Value *FirstStoredVal = SL [i]->getValueOperand();
626	Value *FirstStorePtr = SL [i]->getPointerOperand();
627	const SCEVAddRecExpr *FirstStoreEv =
628	cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: FirstStorePtr));
629	APInt FirstStride = getStoreStride(StoreEv: FirstStoreEv);
630	unsigned FirstStoreSize = DL->getTypeStoreSize(Ty: SL [i]->getValueOperand()->getType());
631
632	// See if we can optimize just this store in isolation.
633	if (FirstStride == FirstStoreSize \|\| -FirstStride == FirstStoreSize) {
634	Heads.insert(X: SL [i]);
635	continue;
636	}
637
638	Value FirstSplatValue = nullptr*;
639	Constant FirstPatternValue = nullptr*;
640
641	if (For == ForMemset::Yes)
642	FirstSplatValue = isBytewiseValue(V: FirstStoredVal, DL: *DL);
643	else
644	FirstPatternValue = getMemSetPatternValue(V: FirstStoredVal, DL);
645
646	assert((FirstSplatValue \|\| FirstPatternValue) &&
647	"Expected either splat value or pattern value.");
648
649	IndexQueue.clear();
650	// If a store has multiple consecutive store candidates, search Stores
651	// array according to the sequence: from i+1 to e, then from i-1 to 0.
652	// This is because usually pairing with immediate succeeding or preceding
653	// candidate create the best chance to find memset opportunity.
654	unsigned j = `0`;
655	for (j = i + `1`; j < e; ++j)
656	IndexQueue.push_back(Elt: j);
657	for (j = i; j > `0`; --j)
658	IndexQueue.push_back(Elt: j - `1`);
659
660	for (auto &k : IndexQueue) {
661	assert(SL[k]->isSimple() && "Expected only non-volatile stores.");
662	Value *SecondStorePtr = SL [k]->getPointerOperand();
663	const SCEVAddRecExpr *SecondStoreEv =
664	cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: SecondStorePtr));
665	APInt SecondStride = getStoreStride(StoreEv: SecondStoreEv);
666
667	if (FirstStride != SecondStride)
668	continue;
669
670	Value *SecondStoredVal = SL [k]->getValueOperand();
671	Value SecondSplatValue = nullptr*;
672	Constant SecondPatternValue = nullptr*;
673
674	if (For == ForMemset::Yes)
675	SecondSplatValue = isBytewiseValue(V: SecondStoredVal, DL: *DL);
676	else
677	SecondPatternValue = getMemSetPatternValue(V: SecondStoredVal, DL);
678
679	assert((SecondSplatValue \|\| SecondPatternValue) &&
680	"Expected either splat value or pattern value.");
681
682	if (isConsecutiveAccess(A: SL [i], B: SL [k], DL: DL, SE&: SE, CheckType: false)) {
683	if (For == ForMemset::Yes) {
684	if (isa<UndefValue>(Val: FirstSplatValue))
685	FirstSplatValue = SecondSplatValue;
686	if (FirstSplatValue != SecondSplatValue)
687	continue;
688	} else {
689	if (isa<UndefValue>(Val: FirstPatternValue))
690	FirstPatternValue = SecondPatternValue;
691	if (FirstPatternValue != SecondPatternValue)
692	continue;
693	}
694	Tails.insert(X: SL [k]);
695	Heads.insert(X: SL [i]);
696	ConsecutiveChain [SL [i]] = SL [k];
697	break;
698	}
699	}
700	}
701
702	// We may run into multiple chains that merge into a single chain. We mark the
703	// stores that we transformed so that we don't visit the same store twice.
704	SmallPtrSet<Value *, `16`> TransformedStores;
705	bool Changed = false;
706
707	// For stores that start but don't end a link in the chain:
708	for (StoreInst *I : Heads) {
709	if (Tails.count(key: I))
710	continue;
711
712	// We found a store instr that starts a chain. Now follow the chain and try
713	// to transform it.
714	SmallPtrSet<Instruction *, `8`> AdjacentStores;
715	StoreInst *HeadStore = I;
716	unsigned StoreSize = `0`;
717
718	// Collect the chain into a list.
719	while (Tails.count(key: I) \|\| Heads.count(key: I)) {
720	if (TransformedStores.count(Ptr: I))
721	break;
722	AdjacentStores.insert(Ptr: I);
723
724	StoreSize += DL->getTypeStoreSize(Ty: I->getValueOperand()->getType());
725	// Move to the next value in the chain.
726	I = ConsecutiveChain [I];
727	}
728
729	Value *StoredVal = HeadStore->getValueOperand();
730	Value *StorePtr = HeadStore->getPointerOperand();
731	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
732	APInt Stride = getStoreStride(StoreEv);
733
734	// Check to see if the stride matches the size of the stores. If so, then
735	// we know that every byte is touched in the loop.
736	if (StoreSize != Stride && StoreSize != -Stride)
737	continue;
738
739	bool IsNegStride = StoreSize == -Stride;
740
741	Type *IntIdxTy = DL->getIndexType(PtrTy: StorePtr->getType());
742	const SCEV *StoreSizeSCEV = SE->getConstant(Ty: IntIdxTy, V: StoreSize);
743	if (processLoopStridedStore(DestPtr: StorePtr, StoreSizeSCEV,
744	StoreAlignment: MaybeAlign (HeadStore->getAlign()), StoredVal,
745	TheStore: HeadStore, Stores&: AdjacentStores, Ev: StoreEv, BECount,
746	IsNegStride)) {
747	TransformedStores.insert_range(R&: AdjacentStores);
748	Changed = true;
749	}
750	}
751
752	return Changed;
753	}
754
755	/// processLoopMemIntrinsic - Template function for calling different processor
756	/// functions based on mem intrinsic type.
757	template <typename MemInst>
758	bool LoopIdiomRecognize::processLoopMemIntrinsic(
759	BasicBlock *BB,
760	bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
761	const SCEV *BECount) {
762	bool MadeChange = false;
763	for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
764	Instruction Inst = &I ++;
765	// Look for memory instructions, which may be optimized to a larger one.
766	if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
767	WeakTrackingVH InstPtr(&*I);
768	if (!(this->*Processor)(MI, BECount))
769	continue;
770	MadeChange = true;
771
772	// If processing the instruction invalidated our iterator, start over from
773	// the top of the block.
774	if (!InstPtr)
775	I = BB->begin();
776	}
777	}
778	return MadeChange;
779	}
780
781	/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
782	bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
783	const SCEV *BECount) {
784	// We can only handle non-volatile memcpys with a constant size.
785	if (MCI->isVolatile() \|\| !isa<ConstantInt>(Val: MCI->getLength()))
786	return false;
787
788	// If we're not allowed to hack on memcpy, we fail.
789	if ((!HasMemcpy && !MCI->isForceInlined()) \|\| DisableLIRP::Memcpy)
790	return false;
791
792	Value *Dest = MCI->getDest();
793	Value *Source = MCI->getSource();
794	if (!Dest \|\| !Source)
795	return false;
796
797	// See if the load and store pointer expressions are AddRec like {base,+,1} on
798	// the current loop, which indicates a strided load and store. If we have
799	// something else, it's a random load or store we can't handle.
800	const SCEV *StoreEv = SE->getSCEV(V: Dest);
801	const SCEV *LoadEv = SE->getSCEV(V: Source);
802	const APInt StoreStrideValue, LoadStrideValue;
803	if (!match(S: StoreEv,
804	P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_scev_APInt(C&: StoreStrideValue),
805	L: m_SpecificLoop(L: CurLoop))) \|\|
806	!match(S: LoadEv,
807	P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_scev_APInt(C&: LoadStrideValue),
808	L: m_SpecificLoop(L: CurLoop))))
809	return false;
810
811	// Reject memcpys that are so large that they overflow an unsigned.
812	uint64_t SizeInBytes = cast<ConstantInt>(Val: MCI->getLength())->getZExtValue();
813	if ((SizeInBytes >> `32`) != `0`)
814	return false;
815
816	// Huge stride value - give up
817	if (StoreStrideValue->getBitWidth() > `64` \|\|
818	LoadStrideValue->getBitWidth() > `64`)
819	return false;
820
821	if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
822	ORE.emit(RemarkBuilder: [&]() {
823	return OptimizationRemarkMissed (DEBUG_TYPE, "SizeStrideUnequal", MCI)
824	<< ore::NV ("Inst", "memcpy") << " in "
825	<< ore::NV ("Function", MCI->getFunction())
826	<< " function will not be hoisted: "
827	<< ore::NV ("Reason", "memcpy size is not equal to stride");
828	});
829	return false;
830	}
831
832	int64_t StoreStrideInt = StoreStrideValue->getSExtValue();
833	int64_t LoadStrideInt = LoadStrideValue->getSExtValue();
834	// Check if the load stride matches the store stride.
835	if (StoreStrideInt != LoadStrideInt)
836	return false;
837
838	return processLoopStoreOfLoopLoad(
839	DestPtr: Dest, SourcePtr: Source, StoreSize: SE->getConstant(Ty: Dest->getType(), V: SizeInBytes),
840	StoreAlign: MCI->getDestAlign(), LoadAlign: MCI->getSourceAlign(), TheStore: MCI, TheLoad: MCI,
841	StoreEv: cast<SCEVAddRecExpr>(Val: StoreEv), LoadEv: cast<SCEVAddRecExpr>(Val: LoadEv), BECount);
842	}
843
844	/// processLoopMemSet - See if this memset can be promoted to a large memset.
845	bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
846	const SCEV *BECount) {
847	// We can only handle non-volatile memsets.
848	if (MSI->isVolatile())
849	return false;
850
851	// If we're not allowed to hack on memset, we fail.
852	if (!HasMemset \|\| DisableLIRP::Memset)
853	return false;
854
855	Value *Pointer = MSI->getDest();
856
857	// See if the pointer expression is an AddRec like {base,+,1} on the current
858	// loop, which indicates a strided store. If we have something else, it's a
859	// random store we can't handle.
860	const SCEV *Ev = SE->getSCEV(V: Pointer);
861	const SCEV *PointerStrideSCEV;
862	if (!match(S: Ev, P: m_scev_AffineAddRec(Op0: m_SCEV(), Op1: m_SCEV(V&: PointerStrideSCEV),
863	L: m_SpecificLoop(L: CurLoop)))) {
864	LLVM_DEBUG(dbgs() << " Pointer is not affine, abort\n");
865	return false;
866	}
867
868	const SCEV *MemsetSizeSCEV = SE->getSCEV(V: MSI->getLength());
869
870	bool IsNegStride = false;
871	const bool IsConstantSize = isa<ConstantInt>(Val: MSI->getLength());
872
873	if (IsConstantSize) {
874	// Memset size is constant.
875	// Check if the pointer stride matches the memset size. If so, then
876	// we know that every byte is touched in the loop.
877	LLVM_DEBUG(dbgs() << " memset size is constant\n");
878	uint64_t SizeInBytes = cast<ConstantInt>(Val: MSI->getLength())->getZExtValue();
879	const APInt *Stride;
880	if (!match(S: PointerStrideSCEV, P: m_scev_APInt(C&: Stride)))
881	return false;
882
883	if (SizeInBytes != Stride && SizeInBytes != -Stride)
884	return false;
885
886	IsNegStride = SizeInBytes == -*Stride;
887	} else {
888	// Memset size is non-constant.
889	// Check if the pointer stride matches the memset size.
890	// To be conservative, the pass would not promote pointers that aren't in
891	// address space zero. Also, the pass only handles memset length and stride
892	// that are invariant for the top level loop.
893	LLVM_DEBUG(dbgs() << " memset size is non-constant\n");
894	if (Pointer->getType()->getPointerAddressSpace() != `0`) {
895	LLVM_DEBUG(dbgs() << " pointer is not in address space zero, "
896	<< "abort\n");
897	return false;
898	}
899	if (!SE->isLoopInvariant(S: MemsetSizeSCEV, L: CurLoop)) {
900	LLVM_DEBUG(dbgs() << " memset size is not a loop-invariant, "
901	<< "abort\n");
902	return false;
903	}
904
905	// Compare positive direction PointerStrideSCEV with MemsetSizeSCEV
906	IsNegStride = PointerStrideSCEV->isNonConstantNegative();
907	const SCEV *PositiveStrideSCEV =
908	IsNegStride ? SE->getNegativeSCEV(V: PointerStrideSCEV)
909	: PointerStrideSCEV;
910	LLVM_DEBUG(dbgs() << " MemsetSizeSCEV: " << *MemsetSizeSCEV << "\n"
911	<< " PositiveStrideSCEV: " << *PositiveStrideSCEV
912	<< "\n");
913
914	if (PositiveStrideSCEV != MemsetSizeSCEV) {
915	// If an expression is covered by the loop guard, compare again and
916	// proceed with optimization if equal.
917	const SCEV *FoldedPositiveStride =
918	SE->applyLoopGuards(Expr: PositiveStrideSCEV, L: CurLoop);
919	const SCEV *FoldedMemsetSize =
920	SE->applyLoopGuards(Expr: MemsetSizeSCEV, L: CurLoop);
921
922	LLVM_DEBUG(dbgs() << " Try to fold SCEV based on loop guard\n"
923	<< " FoldedMemsetSize: " << *FoldedMemsetSize << "\n"
924	<< " FoldedPositiveStride: " << *FoldedPositiveStride
925	<< "\n");
926
927	if (FoldedPositiveStride != FoldedMemsetSize) {
928	LLVM_DEBUG(dbgs() << " SCEV don't match, abort\n");
929	return false;
930	}
931	}
932	}
933
934	// Verify that the memset value is loop invariant. If not, we can't promote
935	// the memset.
936	Value *SplatValue = MSI->getValue();
937	if (!SplatValue \|\| !CurLoop->isLoopInvariant(V: SplatValue))
938	return false;
939
940	SmallPtrSet<Instruction *, `1`> MSIs;
941	MSIs.insert(Ptr: MSI);
942	return processLoopStridedStore(DestPtr: Pointer, StoreSizeSCEV: SE->getSCEV(V: MSI->getLength()),
943	StoreAlignment: MSI->getDestAlign(), StoredVal: SplatValue, TheStore: MSI, Stores&: MSIs,
944	Ev: cast<SCEVAddRecExpr>(Val: Ev), BECount, IsNegStride,
945	/IsLoopMemset=/true);
946	}
947
948	/// mayLoopAccessLocation - Return true if the specified loop might access the
949	/// specified pointer location, which is a loop-strided access. The 'Access'
950	/// argument specifies what the verboten forms of access are (read or write).
951	static bool
952	mayLoopAccessLocation(Value Ptr, ModRefInfo Access, Loop L,
953	const SCEV BECount, const* SCEV *StoreSizeSCEV,
954	AliasAnalysis &AA,
955	SmallPtrSetImpl<Instruction *> &IgnoredInsts) {
956	// Get the location that may be stored across the loop. Since the access is
957	// strided positively through memory, we say that the modified location starts
958	// at the pointer and has infinite size.
959	LocationSize AccessSize = LocationSize::afterPointer();
960
961	// If the loop iterates a fixed number of times, we can refine the access size
962	// to be exactly the size of the memset, which is (BECount+1)StoreSize*
963	const APInt BECst, ConstSize;
964	if (match(S: BECount, P: m_scev_APInt(C&: BECst)) &&
965	match(S: StoreSizeSCEV, P: m_scev_APInt(C&: ConstSize))) {
966	std::optional<uint64_t> BEInt = BECst->tryZExtValue();
967	std::optional<uint64_t> SizeInt = ConstSize->tryZExtValue();
968	// FIXME: Should this check for overflow?
969	if (BEInt && SizeInt)
970	AccessSize = LocationSize::precise(Value: (BEInt + `1`) *SizeInt);
971	}
972
973	// TODO: For this to be really effective, we have to dive into the pointer
974	// operand in the store. Store to &A[i] of 100 will always return may alias
975	// with store of &A[100], we need to StoreLoc to be "A" with size of 100,
976	// which will then no-alias a store to &A[100].
977	MemoryLocation StoreLoc(Ptr, AccessSize);
978
979	for (BasicBlock *B : L->blocks())
980	for (Instruction &I : *B)
981	if (!IgnoredInsts.contains(Ptr: &I) &&
982	isModOrRefSet(MRI: AA.getModRefInfo(I: &I, OptLoc: StoreLoc) & Access))
983	return true;
984	return false;
985	}
986
987	// If we have a negative stride, Start refers to the end of the memory location
988	// we're trying to memset. Therefore, we need to recompute the base pointer,
989	// which is just Start - BECountSize.*
990	static const SCEV getStartForNegStride(const* SCEV Start, const* SCEV *BECount,
991	Type IntPtr, const* SCEV *StoreSizeSCEV,
992	ScalarEvolution *SE) {
993	const SCEV *Index = SE->getTruncateOrZeroExtend(V: BECount, Ty: IntPtr);
994	if (!StoreSizeSCEV->isOne()) {
995	// index = back edge count store size*
996	Index = SE->getMulExpr(LHS: Index,
997	RHS: SE->getTruncateOrZeroExtend(V: StoreSizeSCEV, Ty: IntPtr),
998	Flags: SCEV::FlagNUW);
999	}
1000	// base pointer = start - index store size*
1001	return SE->getMinusSCEV(LHS: Start, RHS: Index);
1002	}
1003
1004	/// Compute the number of bytes as a SCEV from the backedge taken count.
1005	///
1006	/// This also maps the SCEV into the provided type and tries to handle the
1007	/// computation in a way that will fold cleanly.
1008	static const SCEV getNumBytes(const* SCEV BECount, Type IntPtr,
1009	const SCEV StoreSizeSCEV, Loop CurLoop,
1010	const DataLayout DL, ScalarEvolution SE) {
1011	const SCEV *TripCountSCEV =
1012	SE->getTripCountFromExitCount(ExitCount: BECount, EvalTy: IntPtr, L: CurLoop);
1013	return SE->getMulExpr(LHS: TripCountSCEV,
1014	RHS: SE->getTruncateOrZeroExtend(V: StoreSizeSCEV, Ty: IntPtr),
1015	Flags: SCEV::FlagNUW);
1016	}
1017
1018	/// processLoopStridedStore - We see a strided store of some value. If we can
1019	/// transform this into a memset or memset_pattern in the loop preheader, do so.
1020	bool LoopIdiomRecognize::processLoopStridedStore(
1021	Value DestPtr, const* SCEV *StoreSizeSCEV, MaybeAlign StoreAlignment,
1022	Value StoredVal, Instruction TheStore,
1023	SmallPtrSetImpl<Instruction > &Stores, const* SCEVAddRecExpr *Ev,
1024	const SCEV BECount, bool* IsNegStride, bool IsLoopMemset) {
1025	Module *M = TheStore->getModule();
1026
1027	// The trip count of the loop and the base pointer of the addrec SCEV is
1028	// guaranteed to be loop invariant, which means that it should dominate the
1029	// header. This allows us to insert code for it in the preheader.
1030	unsigned DestAS = DestPtr->getType()->getPointerAddressSpace();
1031	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1032	IRBuilder<> Builder(Preheader->getTerminator());
1033	SCEVExpander Expander(SE, DL, "loop-idiom");
1034	SCEVExpanderCleaner ExpCleaner(Expander);
1035
1036	Type *DestInt8PtrTy = Builder.getPtrTy(AddrSpace: DestAS);
1037	Type *IntIdxTy = DL->getIndexType(PtrTy: DestPtr->getType());
1038
1039	bool Changed = false;
1040	const SCEV *Start = Ev->getStart();
1041	// Handle negative strided loops.
1042	if (IsNegStride)
1043	Start = getStartForNegStride(Start, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1044
1045	// TODO: ideally we should still be able to generate memset if SCEV expander
1046	// is taught to generate the dependencies at the latest point.
1047	if (!Expander.isSafeToExpand(S: Start))
1048	return Changed;
1049
1050	// Okay, we have a strided store "p[i]" of a splattable value. We can turn
1051	// this into a memset in the loop preheader now if we want. However, this
1052	// would be unsafe to do if there is anything else in the loop that may read
1053	// or write to the aliased location. Check for any overlap by generating the
1054	// base pointer and checking the region.
1055	Value *BasePtr =
1056	Expander.expandCodeFor(SH: Start, Ty: DestInt8PtrTy, I: Preheader->getTerminator());
1057
1058	// From here on out, conservatively report to the pass manager that we've
1059	// changed the IR, even if we later clean up these added instructions. There
1060	// may be structural differences e.g. in the order of use lists not accounted
1061	// for in just a textual dump of the IR. This is written as a variable, even
1062	// though statically all the places this dominates could be replaced with
1063	// 'true', with the hope that anyone trying to be clever / "more precise" with
1064	// the return value will read this comment, and leave them alone.
1065	Changed = true;
1066
1067	if (mayLoopAccessLocation(Ptr: BasePtr, Access: ModRefInfo::ModRef, L: CurLoop, BECount,
1068	StoreSizeSCEV, AA&: *AA, IgnoredInsts&: Stores))
1069	return Changed;
1070
1071	if (avoidLIRForMultiBlockLoop(/IsMemset=/true, IsLoopMemset))
1072	return Changed;
1073
1074	// Okay, everything looks good, insert the memset.
1075	Value SplatValue = isBytewiseValue(V: StoredVal, DL: DL);
1076	Constant PatternValue = nullptr*;
1077	if (!SplatValue)
1078	PatternValue = getMemSetPatternValue(V: StoredVal, DL);
1079
1080	// MemsetArg is the number of bytes for the memset libcall, and the number
1081	// of pattern repetitions if the memset.pattern intrinsic is being used.
1082	Value *MemsetArg;
1083	std::optional<int64_t> BytesWritten;
1084
1085	if (PatternValue && (HasMemsetPattern \|\| ForceMemsetPatternIntrinsic)) {
1086	const SCEV *TripCountS =
1087	SE->getTripCountFromExitCount(ExitCount: BECount, EvalTy: IntIdxTy, L: CurLoop);
1088	if (!Expander.isSafeToExpand(S: TripCountS))
1089	return Changed;
1090	const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(Val: StoreSizeSCEV);
1091	if (!ConstStoreSize)
1092	return Changed;
1093	Value *TripCount = Expander.expandCodeFor(SH: TripCountS, Ty: IntIdxTy,
1094	I: Preheader->getTerminator());
1095	uint64_t PatternRepsPerTrip =
1096	(ConstStoreSize->getValue()->getZExtValue() * `8`) /
1097	DL->getTypeSizeInBits(Ty: PatternValue->getType());
1098	// If ConstStoreSize is not equal to the width of PatternValue, then
1099	// MemsetArg is TripCount (ConstStoreSize/PatternValueWidth). Else*
1100	// MemSetArg is just TripCount.
1101	MemsetArg =
1102	PatternRepsPerTrip == `1`
1103	? TripCount
1104	: Builder.CreateMul(LHS: TripCount,
1105	RHS: Builder.getIntN(N: IntIdxTy->getIntegerBitWidth(),
1106	C: PatternRepsPerTrip));
1107	if (auto *CI = dyn_cast<ConstantInt>(Val: TripCount))
1108	BytesWritten =
1109	CI->getZExtValue() * ConstStoreSize->getValue()->getZExtValue();
1110
1111	} else {
1112	const SCEV *NumBytesS =
1113	getNumBytes(BECount, IntPtr: IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1114
1115	// TODO: ideally we should still be able to generate memset if SCEV expander
1116	// is taught to generate the dependencies at the latest point.
1117	if (!Expander.isSafeToExpand(S: NumBytesS))
1118	return Changed;
1119	MemsetArg =
1120	Expander.expandCodeFor(SH: NumBytesS, Ty: IntIdxTy, I: Preheader->getTerminator());
1121	if (auto *CI = dyn_cast<ConstantInt>(Val: MemsetArg))
1122	BytesWritten = CI->getZExtValue();
1123	}
1124	assert(MemsetArg && "MemsetArg should have been set");
1125
1126	AAMDNodes AATags = TheStore->getAAMetadata();
1127	for (Instruction *Store : Stores)
1128	AATags = AATags.merge(Other: Store->getAAMetadata());
1129	if (BytesWritten)
1130	AATags = AATags.extendTo(Len: BytesWritten.value());
1131	else
1132	AATags = AATags.extendTo(Len: -`1`);
1133
1134	CallInst *NewCall;
1135	if (SplatValue) {
1136	NewCall = Builder.CreateMemSet(Ptr: BasePtr, Val: SplatValue, Size: MemsetArg,
1137	Align: MaybeAlign (StoreAlignment),
1138	/isVolatile=/false, AAInfo: AATags);
1139	} else if (ForceMemsetPatternIntrinsic \|\|
1140	isLibFuncEmittable(M, TLI, TheLibFunc: LibFunc_memset_pattern16)) {
1141	assert(isa<SCEVConstant>(StoreSizeSCEV) && "Expected constant store size");
1142
1143	NewCall = Builder.CreateIntrinsic(
1144	ID: Intrinsic::experimental_memset_pattern,
1145	Types: {DestInt8PtrTy, PatternValue->getType(), IntIdxTy},
1146	Args: {BasePtr, PatternValue, MemsetArg,
1147	ConstantInt::getFalse(Context&: M->getContext())});
1148	if (StoreAlignment)
1149	cast<MemSetPatternInst>(Val: NewCall)->setDestAlignment(*StoreAlignment);
1150	NewCall->setAAMetadata(AATags);
1151	} else {
1152	// Neither a memset, nor memset_pattern16
1153	return Changed;
1154	}
1155
1156	NewCall->setDebugLoc(TheStore->getDebugLoc());
1157
1158	if (MSSAU) {
1159	MemoryAccess *NewMemAcc = MSSAU ->createMemoryAccessInBB(
1160	I: NewCall, Definition: nullptr, BB: NewCall->getParent(), Point: MemorySSA::BeforeTerminator);
1161	MSSAU ->insertDef(Def: cast<MemoryDef>(Val: NewMemAcc), RenameUses: true);
1162	}
1163
1164	LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
1165	<< " from store to: " << Ev << " at: " << TheStore
1166	<< "\n");
1167
1168	ORE.emit(RemarkBuilder: [&]() {
1169	OptimizationRemark R(DEBUG_TYPE, "ProcessLoopStridedStore",
1170	NewCall->getDebugLoc(), Preheader);
1171	R << "Transformed loop-strided store in "
1172	<< ore::NV ("Function", TheStore->getFunction())
1173	<< " function into a call to "
1174	<< ore::NV ("NewFunction", NewCall->getCalledFunction())
1175	<< "() intrinsic";
1176	if (!Stores.empty())
1177	R << ore::setExtraArgs ();
1178	for (auto *I : Stores) {
1179	R << ore::NV ("FromBlock", I->getParent()->getName())
1180	<< ore::NV ("ToBlock", Preheader->getName());
1181	}
1182	return R;
1183	});
1184
1185	// Okay, the memset has been formed. Zap the original store and anything that
1186	// feeds into it.
1187	for (auto *I : Stores) {
1188	if (MSSAU)
1189	MSSAU ->removeMemoryAccess(I, OptimizePhis: true);
1190	deleteDeadInstruction(I);
1191	}
1192	if (MSSAU && VerifyMemorySSA)
1193	MSSAU ->getMemorySSA()->verifyMemorySSA();
1194	++NumMemSet;
1195	ExpCleaner.markResultUsed();
1196	return true;
1197	}
1198
1199	/// If the stored value is a strided load in the same loop with the same stride
1200	/// this may be transformable into a memcpy. This kicks in for stuff like
1201	/// for (i) A[i] = B[i];
1202	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
1203	const SCEV *BECount) {
1204	assert(SI->isUnordered() && "Expected only non-volatile non-ordered stores.");
1205
1206	Value *StorePtr = SI->getPointerOperand();
1207	const SCEVAddRecExpr *StoreEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: StorePtr));
1208	unsigned StoreSize = DL->getTypeStoreSize(Ty: SI->getValueOperand()->getType());
1209
1210	// The store must be feeding a non-volatile load.
1211	LoadInst *LI = cast<LoadInst>(Val: SI->getValueOperand());
1212	assert(LI->isUnordered() && "Expected only non-volatile non-ordered loads.");
1213
1214	// See if the pointer expression is an AddRec like {base,+,1} on the current
1215	// loop, which indicates a strided load. If we have something else, it's a
1216	// random load we can't handle.
1217	Value *LoadPtr = LI->getPointerOperand();
1218	const SCEVAddRecExpr *LoadEv = cast<SCEVAddRecExpr>(Val: SE->getSCEV(V: LoadPtr));
1219
1220	const SCEV *StoreSizeSCEV = SE->getConstant(Ty: StorePtr->getType(), V: StoreSize);
1221	return processLoopStoreOfLoopLoad(DestPtr: StorePtr, SourcePtr: LoadPtr, StoreSize: StoreSizeSCEV,
1222	StoreAlign: SI->getAlign(), LoadAlign: LI->getAlign(), TheStore: SI, TheLoad: LI,
1223	StoreEv, LoadEv, BECount);
1224	}
1225
1226	namespace {
1227	class MemmoveVerifier {
1228	public:
1229	explicit MemmoveVerifier(const Value &LoadBasePtr, const Value &StoreBasePtr,
1230	const DataLayout &DL)
1231	: DL(DL), BP1(llvm::GetPointerBaseWithConstantOffset(
1232	Ptr: LoadBasePtr.stripPointerCasts(), Offset&: LoadOff, DL)),
1233	BP2(llvm::GetPointerBaseWithConstantOffset(
1234	Ptr: StoreBasePtr.stripPointerCasts(), Offset&: StoreOff, DL)),
1235	IsSameObject(BP1 == BP2) {}
1236
1237	bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool IsNegStride,
1238	const Instruction &TheLoad,
1239	bool IsMemCpy) const {
1240	if (IsMemCpy) {
1241	// Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1242	// for negative stride.
1243	if ((!IsNegStride && LoadOff <= StoreOff) \|\|
1244	(IsNegStride && LoadOff >= StoreOff))
1245	return false;
1246	} else {
1247	// Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr
1248	// for negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
1249	int64_t LoadSize =
1250	DL.getTypeSizeInBits(Ty: TheLoad.getType()).getFixedValue() / `8`;
1251	if (BP1 != BP2 \|\| LoadSize != int64_t(StoreSize))
1252	return false;
1253	if ((!IsNegStride && LoadOff < StoreOff + int64_t(StoreSize)) \|\|
1254	(IsNegStride && LoadOff + LoadSize > StoreOff))
1255	return false;
1256	}
1257	return true;
1258	}
1259
1260	private:
1261	const DataLayout &DL;
1262	int64_t LoadOff = `0`;
1263	int64_t StoreOff = `0`;
1264	const Value *BP1;
1265	const Value *BP2;
1266
1267	public:
1268	const bool IsSameObject;
1269	};
1270	} // namespace
1271
1272	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
1273	Value DestPtr, Value SourcePtr, const SCEV *StoreSizeSCEV,
1274	MaybeAlign StoreAlign, MaybeAlign LoadAlign, Instruction *TheStore,
1275	Instruction TheLoad, const* SCEVAddRecExpr *StoreEv,
1276	const SCEVAddRecExpr LoadEv, const* SCEV *BECount) {
1277
1278	// FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to
1279	// conservatively bail here, since otherwise we may have to transform
1280	// llvm.memcpy.inline into llvm.memcpy which is illegal.
1281	if (auto *MCI = dyn_cast<MemCpyInst>(Val: TheStore); MCI && MCI->isForceInlined())
1282	return false;
1283
1284	// The trip count of the loop and the base pointer of the addrec SCEV is
1285	// guaranteed to be loop invariant, which means that it should dominate the
1286	// header. This allows us to insert code for it in the preheader.
1287	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1288	IRBuilder<> Builder(Preheader->getTerminator());
1289	SCEVExpander Expander(SE, DL, "loop-idiom");
1290
1291	SCEVExpanderCleaner ExpCleaner(Expander);
1292
1293	bool Changed = false;
1294	const SCEV *StrStart = StoreEv->getStart();
1295	unsigned StrAS = DestPtr->getType()->getPointerAddressSpace();
1296	Type *IntIdxTy = Builder.getIntNTy(N: DL->getIndexSizeInBits(AS: StrAS));
1297
1298	APInt Stride = getStoreStride(StoreEv);
1299	const SCEVConstant *ConstStoreSize = dyn_cast<SCEVConstant>(Val: StoreSizeSCEV);
1300
1301	// TODO: Deal with non-constant size; Currently expect constant store size
1302	assert(ConstStoreSize && "store size is expected to be a constant");
1303
1304	int64_t StoreSize = ConstStoreSize->getValue()->getZExtValue();
1305	bool IsNegStride = StoreSize == -Stride;
1306
1307	// Handle negative strided loops.
1308	if (IsNegStride)
1309	StrStart =
1310	getStartForNegStride(Start: StrStart, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1311
1312	// Okay, we have a strided store "p[i]" of a loaded value. We can turn
1313	// this into a memcpy in the loop preheader now if we want. However, this
1314	// would be unsafe to do if there is anything else in the loop that may read
1315	// or write the memory region we're storing to. This includes the load that
1316	// feeds the stores. Check for an alias by generating the base address and
1317	// checking everything.
1318	Value *StoreBasePtr = Expander.expandCodeFor(
1319	SH: StrStart, Ty: Builder.getPtrTy(AddrSpace: StrAS), I: Preheader->getTerminator());
1320
1321	// From here on out, conservatively report to the pass manager that we've
1322	// changed the IR, even if we later clean up these added instructions. There
1323	// may be structural differences e.g. in the order of use lists not accounted
1324	// for in just a textual dump of the IR. This is written as a variable, even
1325	// though statically all the places this dominates could be replaced with
1326	// 'true', with the hope that anyone trying to be clever / "more precise" with
1327	// the return value will read this comment, and leave them alone.
1328	Changed = true;
1329
1330	SmallPtrSet<Instruction *, `2`> IgnoredInsts;
1331	IgnoredInsts.insert(Ptr: TheStore);
1332
1333	bool IsMemCpy = isa<MemCpyInst>(Val: TheStore);
1334	const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
1335
1336	bool LoopAccessStore =
1337	mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop, BECount,
1338	StoreSizeSCEV, AA&: *AA, IgnoredInsts);
1339	if (LoopAccessStore) {
1340	// For memmove case it's not enough to guarantee that loop doesn't access
1341	// TheStore and TheLoad. Additionally we need to make sure that TheStore is
1342	// the only user of TheLoad.
1343	if (!TheLoad->hasOneUse())
1344	return Changed;
1345	IgnoredInsts.insert(Ptr: TheLoad);
1346	if (mayLoopAccessLocation(Ptr: StoreBasePtr, Access: ModRefInfo::ModRef, L: CurLoop,
1347	BECount, StoreSizeSCEV, AA&: *AA, IgnoredInsts)) {
1348	ORE.emit(RemarkBuilder: [&]() {
1349	return OptimizationRemarkMissed (DEBUG_TYPE, "LoopMayAccessStore",
1350	TheStore)
1351	<< ore::NV ("Inst", InstRemark) << " in "
1352	<< ore::NV ("Function", TheStore->getFunction())
1353	<< " function will not be hoisted: "
1354	<< ore::NV ("Reason", "The loop may access store location");
1355	});
1356	return Changed;
1357	}
1358	IgnoredInsts.erase(Ptr: TheLoad);
1359	}
1360
1361	const SCEV *LdStart = LoadEv->getStart();
1362	unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();
1363
1364	// Handle negative strided loops.
1365	if (IsNegStride)
1366	LdStart =
1367	getStartForNegStride(Start: LdStart, BECount, IntPtr: IntIdxTy, StoreSizeSCEV, SE);
1368
1369	// For a memcpy, we have to make sure that the input array is not being
1370	// mutated by the loop.
1371	Value *LoadBasePtr = Expander.expandCodeFor(SH: LdStart, Ty: Builder.getPtrTy(AddrSpace: LdAS),
1372	I: Preheader->getTerminator());
1373
1374	// If the store is a memcpy instruction, we must check if it will write to
1375	// the load memory locations. So remove it from the ignored stores.
1376	MemmoveVerifier Verifier(LoadBasePtr, StoreBasePtr, *DL);
1377	if (IsMemCpy && !Verifier.IsSameObject)
1378	IgnoredInsts.erase(Ptr: TheStore);
1379	if (mayLoopAccessLocation(Ptr: LoadBasePtr, Access: ModRefInfo::Mod, L: CurLoop, BECount,
1380	StoreSizeSCEV, AA&: *AA, IgnoredInsts)) {
1381	ORE.emit(RemarkBuilder: [&]() {
1382	return OptimizationRemarkMissed (DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
1383	<< ore::NV ("Inst", InstRemark) << " in "
1384	<< ore::NV ("Function", TheStore->getFunction())
1385	<< " function will not be hoisted: "
1386	<< ore::NV ("Reason", "The loop may access load location");
1387	});
1388	return Changed;
1389	}
1390
1391	bool IsAtomic = TheStore->isAtomic() \|\| TheLoad->isAtomic();
1392	bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
1393
1394	if (IsAtomic) {
1395	// For now don't support unordered atomic memmove.
1396	if (UseMemMove)
1397	return Changed;
1398
1399	// We cannot allow unaligned ops for unordered load/store, so reject
1400	// anything where the alignment isn't at least the element size.
1401	assert((StoreAlign && LoadAlign) &&
1402	"Expect unordered load/store to have align.");
1403	if (StoreAlign < StoreSize \|\| LoadAlign < StoreSize)
1404	return Changed;
1405
1406	// If the element.atomic memcpy is not lowered into explicit
1407	// loads/stores later, then it will be lowered into an element-size
1408	// specific lib call. If the lib call doesn't exist for our store size, then
1409	// we shouldn't generate the memcpy.
1410	if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
1411	return Changed;
1412	}
1413
1414	if (UseMemMove)
1415	if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, IsNegStride, TheLoad: *TheLoad,
1416	IsMemCpy))
1417	return Changed;
1418
1419	if (avoidLIRForMultiBlockLoop())
1420	return Changed;
1421
1422	// Okay, everything is safe, we can transform this!
1423
1424	const SCEV *NumBytesS =
1425	getNumBytes(BECount, IntPtr: IntIdxTy, StoreSizeSCEV, CurLoop, DL, SE);
1426
1427	Value *NumBytes =
1428	Expander.expandCodeFor(SH: NumBytesS, Ty: IntIdxTy, I: Preheader->getTerminator());
1429
1430	AAMDNodes AATags = TheLoad->getAAMetadata();
1431	AAMDNodes StoreAATags = TheStore->getAAMetadata();
1432	AATags = AATags.merge(Other: StoreAATags);
1433	if (auto CI = dyn_cast<ConstantInt>(Val: NumBytes))
1434	AATags = AATags.extendTo(Len: CI->getZExtValue());
1435	else
1436	AATags = AATags.extendTo(Len: -`1`);
1437
1438	CallInst NewCall = nullptr*;
1439	// Check whether to generate an unordered atomic memcpy:
1440	// If the load or store are atomic, then they must necessarily be unordered
1441	// by previous checks.
1442	if (!IsAtomic) {
1443	if (UseMemMove)
1444	NewCall = Builder.CreateMemMove(Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr,
1445	SrcAlign: LoadAlign, Size: NumBytes,
1446	/isVolatile=/false, AAInfo: AATags);
1447	else
1448	NewCall =
1449	Builder.CreateMemCpy(Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr, SrcAlign: LoadAlign,
1450	Size: NumBytes, /isVolatile=/false, AAInfo: AATags);
1451	} else {
1452	// Create the call.
1453	// Note that unordered atomic loads/stores are required* by the spec to*
1454	// have an alignment but non-atomic loads/stores may not.
1455	NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
1456	Dst: StoreBasePtr, DstAlign: StoreAlign, Src: LoadBasePtr, SrcAlign: LoadAlign, Size: NumBytes, ElementSize: StoreSize,
1457	AAInfo: AATags);
1458	}
1459	NewCall->setDebugLoc(TheStore->getDebugLoc());
1460
1461	if (MSSAU) {
1462	MemoryAccess *NewMemAcc = MSSAU ->createMemoryAccessInBB(
1463	I: NewCall, Definition: nullptr, BB: NewCall->getParent(), Point: MemorySSA::BeforeTerminator);
1464	MSSAU ->insertDef(Def: cast<MemoryDef>(Val: NewMemAcc), RenameUses: true);
1465	}
1466
1467	LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
1468	<< " from load ptr=" << LoadEv << " at: " << TheLoad
1469	<< "\n"
1470	<< " from store ptr=" << StoreEv << " at: " << TheStore
1471	<< "\n");
1472
1473	ORE.emit(RemarkBuilder: [&]() {
1474	return OptimizationRemark (DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
1475	NewCall->getDebugLoc(), Preheader)
1476	<< "Formed a call to "
1477	<< ore::NV ("NewFunction", NewCall->getCalledFunction())
1478	<< "() intrinsic from " << ore::NV ("Inst", InstRemark)
1479	<< " instruction in " << ore::NV ("Function", TheStore->getFunction())
1480	<< " function"
1481	<< ore::setExtraArgs ()
1482	<< ore::NV ("FromBlock", TheStore->getParent()->getName())
1483	<< ore::NV ("ToBlock", Preheader->getName());
1484	});
1485
1486	// Okay, a new call to memcpy/memmove has been formed. Zap the original store
1487	// and anything that feeds into it.
1488	if (MSSAU)
1489	MSSAU ->removeMemoryAccess(I: TheStore, OptimizePhis: true);
1490	deleteDeadInstruction(I: TheStore);
1491	if (MSSAU && VerifyMemorySSA)
1492	MSSAU ->getMemorySSA()->verifyMemorySSA();
1493	if (UseMemMove)
1494	++NumMemMove;
1495	else
1496	++NumMemCpy;
1497	ExpCleaner.markResultUsed();
1498	return true;
1499	}
1500
1501	// When compiling for codesize we avoid idiom recognition for a multi-block loop
1502	// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
1503	//
1504	bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
1505	bool IsLoopMemset) {
1506	if (ApplyCodeSizeHeuristics && CurLoop->getNumBlocks() > `1`) {
1507	if (CurLoop->isOutermost() && (!IsMemset \|\| !IsLoopMemset)) {
1508	LLVM_DEBUG(dbgs() << " " << CurLoop->getHeader()->getParent()->getName()
1509	<< " : LIR " << (IsMemset ? "Memset" : "Memcpy")
1510	<< " avoided: multi-block top-level loop\n");
1511	return true;
1512	}
1513	}
1514
1515	return false;
1516	}
1517
1518	bool LoopIdiomRecognize::runOnNoncountableLoop() {
1519	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Scanning: F["
1520	<< CurLoop->getHeader()->getParent()->getName()
1521	<< "] Noncountable Loop %"
1522	<< CurLoop->getHeader()->getName() << "\n");
1523
1524	return recognizePopcount() \|\| recognizeAndInsertFFS() \|\|
1525	recognizeShiftUntilBitTest() \|\| recognizeShiftUntilZero() \|\|
1526	recognizeShiftUntilLessThan() \|\| recognizeAndInsertStrLen();
1527	}
1528
1529	/// Check if the given conditional branch is based on the comparison between
1530	/// a variable and zero, and if the variable is non-zero or zero (JmpOnZero is
1531	/// true), the control yields to the loop entry. If the branch matches the
1532	/// behavior, the variable involved in the comparison is returned. This function
1533	/// will be called to see if the precondition and postcondition of the loop are
1534	/// in desirable form.
1535	static Value matchCondition(BranchInst BI, BasicBlock *LoopEntry,
1536	bool JmpOnZero = false) {
1537	if (!BI \|\| !BI->isConditional())
1538	return nullptr;
1539
1540	ICmpInst *Cond = dyn_cast<ICmpInst>(Val: BI->getCondition());
1541	if (!Cond)
1542	return nullptr;
1543
1544	auto *CmpZero = dyn_cast<ConstantInt>(Val: Cond->getOperand(i_nocapture: `1`));
1545	if (!CmpZero \|\| !CmpZero->isZero())
1546	return nullptr;
1547
1548	BasicBlock *TrueSucc = BI->getSuccessor(i: `0`);
1549	BasicBlock *FalseSucc = BI->getSuccessor(i: `1`);
1550	if (JmpOnZero)
1551	std::swap(a&: TrueSucc, b&: FalseSucc);
1552
1553	ICmpInst::Predicate Pred = Cond->getPredicate();
1554	if ((Pred == ICmpInst::ICMP_NE && TrueSucc == LoopEntry) \|\|
1555	(Pred == ICmpInst::ICMP_EQ && FalseSucc == LoopEntry))
1556	return Cond->getOperand(i_nocapture: `0`);
1557
1558	return nullptr;
1559	}
1560
1561	namespace {
1562
1563	class StrlenVerifier {
1564	public:
1565	explicit StrlenVerifier(const Loop CurLoop, ScalarEvolution SE,
1566	const TargetLibraryInfo *TLI)
1567	: CurLoop(CurLoop), SE(SE), TLI(TLI) {}
1568
1569	bool isValidStrlenIdiom() {
1570	// Give up if the loop has multiple blocks, multiple backedges, or
1571	// multiple exit blocks
1572	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1` \|\|
1573	!CurLoop->getUniqueExitBlock())
1574	return false;
1575
1576	// It should have a preheader and a branch instruction.
1577	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1578	if (!Preheader)
1579	return false;
1580
1581	BranchInst *EntryBI = dyn_cast<BranchInst>(Val: Preheader->getTerminator());
1582	if (!EntryBI)
1583	return false;
1584
1585	// The loop exit must be conditioned on an icmp with 0 the null terminator.
1586	// The icmp operand has to be a load on some SSA reg that increments
1587	// by 1 in the loop.
1588	BasicBlock LoopBody = CurLoop->block_begin();
1589
1590	// Skip if the body is too big as it most likely is not a strlen idiom.
1591	if (!LoopBody \|\| LoopBody->size() >= `15`)
1592	return false;
1593
1594	BranchInst *LoopTerm = dyn_cast<BranchInst>(Val: LoopBody->getTerminator());
1595	Value *LoopCond = matchCondition(BI: LoopTerm, LoopEntry: LoopBody);
1596	if (!LoopCond)
1597	return false;
1598
1599	LoadInst *LoopLoad = dyn_cast<LoadInst>(Val: LoopCond);
1600	if (!LoopLoad \|\| LoopLoad->getPointerAddressSpace() != `0`)
1601	return false;
1602
1603	OperandType = LoopLoad->getType();
1604	if (!OperandType \|\| !OperandType->isIntegerTy())
1605	return false;
1606
1607	// See if the pointer expression is an AddRec with constant step a of form
1608	// ({n,+,a}) where a is the width of the char type.
1609	Value *IncPtr = LoopLoad->getPointerOperand();
1610	const SCEV *LoadEv = SE->getSCEV(V: IncPtr);
1611	const APInt *Step;
1612	if (!match(S: LoadEv,
1613	P: m_scev_AffineAddRec(Op0: m_SCEV(V&: LoadBaseEv), Op1: m_scev_APInt(C&: Step))))
1614	return false;
1615
1616	LLVM_DEBUG(dbgs() << "pointer load scev: " << *LoadEv << "\n");
1617
1618	unsigned StepSize = Step->getZExtValue();
1619
1620	// Verify that StepSize is consistent with platform char width.
1621	OpWidth = OperandType->getIntegerBitWidth();
1622	unsigned WcharSize = TLI->getWCharSize(M: *LoopLoad->getModule());
1623	if (OpWidth != StepSize * `8`)
1624	return false;
1625	if (OpWidth != `8` && OpWidth != `16` && OpWidth != `32`)
1626	return false;
1627	if (OpWidth >= `16`)
1628	if (OpWidth != WcharSize * `8`)
1629	return false;
1630
1631	// Scan every instruction in the loop to ensure there are no side effects.
1632	for (Instruction &I : *LoopBody)
1633	if (I.mayHaveSideEffects())
1634	return false;
1635
1636	BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1637	if (!LoopExitBB)
1638	return false;
1639
1640	for (PHINode &PN : LoopExitBB->phis()) {
1641	if (!SE->isSCEVable(Ty: PN.getType()))
1642	return false;
1643
1644	const SCEV *Ev = SE->getSCEV(V: &PN);
1645	if (!Ev)
1646	return false;
1647
1648	LLVM_DEBUG(dbgs() << "loop exit phi scev: " << *Ev << "\n");
1649
1650	// Since we verified that the loop trip count will be a valid strlen
1651	// idiom, we can expand all lcssa phi with {n,+,1} as (n + strlen) and use
1652	// SCEVExpander materialize the loop output.
1653	const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Val: Ev);
1654	if (!AddRecEv \|\| !AddRecEv->isAffine())
1655	return false;
1656
1657	// We only want RecAddExpr with recurrence step that is constant. This
1658	// is good enough for all the idioms we want to recognize. Later we expand
1659	// and materialize the recurrence as {base,+,a} -> (base + a strlen)*
1660	if (!isa<SCEVConstant>(Val: AddRecEv->getStepRecurrence(SE&: *SE)))
1661	return false;
1662	}
1663
1664	return true;
1665	}
1666
1667	public:
1668	const Loop *CurLoop;
1669	ScalarEvolution *SE;
1670	const TargetLibraryInfo *TLI;
1671
1672	unsigned OpWidth;
1673	ConstantInt *StepSizeCI;
1674	const SCEV *LoadBaseEv;
1675	Type *OperandType;
1676	};
1677
1678	} // namespace
1679
1680	/// The Strlen Idiom we are trying to detect has the following structure
1681	///
1682	/// preheader:
1683	/// ...
1684	/// br label %body, ...
1685	///
1686	/// body:
1687	/// ... ; %0 is incremented by a gep
1688	/// %1 = load i8, ptr %0, align 1
1689	/// %2 = icmp eq i8 %1, 0
1690	/// br i1 %2, label %exit, label %body
1691	///
1692	/// exit:
1693	/// %lcssa = phi [%0, %body], ...
1694	///
1695	/// We expect the strlen idiom to have a load of a character type that
1696	/// is compared against '\0', and such load pointer operand must have scev
1697	/// expression of the form {%str,+,c} where c is a ConstantInt of the
1698	/// appropiate character width for the idiom, and %str is the base of the string
1699	/// And, that all lcssa phis have the form {...,+,n} where n is a constant,
1700	///
1701	/// When transforming the output of the strlen idiom, the lccsa phi are
1702	/// expanded using SCEVExpander as {base scev,+,a} -> (base scev + a strlen)*
1703	/// and all subsequent uses are replaced. For example,
1704	///
1705	/// \code{.c}
1706	/// const char base = str;*
1707	/// while (str != '\0')*
1708	/// ++str;
1709	/// size_t result = str - base;
1710	/// \endcode
1711	///
1712	/// will be transformed as follows: The idiom will be replaced by a strlen
1713	/// computation to compute the address of the null terminator of the string.
1714	///
1715	/// \code{.c}
1716	/// const char base = str;*
1717	/// const char end = base + strlen(str);*
1718	/// size_t result = end - base;
1719	/// \endcode
1720	///
1721	/// In the case we index by an induction variable, as long as the induction
1722	/// variable has a constant int increment, we can replace all such indvars
1723	/// with the closed form computation of strlen
1724	///
1725	/// \code{.c}
1726	/// size_t i = 0;
1727	/// while (str[i] != '\0')
1728	/// ++i;
1729	/// size_t result = i;
1730	/// \endcode
1731	///
1732	/// Will be replaced by
1733	///
1734	/// \code{.c}
1735	/// size_t i = 0 + strlen(str);
1736	/// size_t result = i;
1737	/// \endcode
1738	///
1739	bool LoopIdiomRecognize::recognizeAndInsertStrLen() {
1740	if (DisableLIRP::All)
1741	return false;
1742
1743	StrlenVerifier Verifier(CurLoop, SE, TLI);
1744
1745	if (!Verifier.isValidStrlenIdiom())
1746	return false;
1747
1748	BasicBlock *Preheader = CurLoop->getLoopPreheader();
1749	BasicBlock LoopBody = CurLoop->block_begin();
1750	BasicBlock *LoopExitBB = CurLoop->getExitBlock();
1751	BranchInst *LoopTerm = dyn_cast<BranchInst>(Val: LoopBody->getTerminator());
1752	assert(Preheader && LoopBody && LoopExitBB && LoopTerm &&
1753	"Should be verified to be valid by StrlenVerifier");
1754
1755	if (Verifier.OpWidth == `8`) {
1756	if (DisableLIRP::Strlen)
1757	return false;
1758	if (!isLibFuncEmittable(M: Preheader->getModule(), TLI, TheLibFunc: LibFunc_strlen))
1759	return false;
1760	} else {
1761	if (DisableLIRP::Wcslen)
1762	return false;
1763	if (!isLibFuncEmittable(M: Preheader->getModule(), TLI, TheLibFunc: LibFunc_wcslen))
1764	return false;
1765	}
1766
1767	IRBuilder<> Builder(Preheader->getTerminator());
1768	Builder.SetCurrentDebugLocation(CurLoop->getStartLoc());
1769	SCEVExpander Expander(*SE, Preheader->getModule()->getDataLayout(),
1770	"strlen_idiom");
1771	Value *MaterialzedBase = Expander.expandCodeFor(
1772	SH: Verifier.LoadBaseEv, Ty: Verifier.LoadBaseEv->getType(),
1773	I: Builder.GetInsertPoint());
1774
1775	Value StrLenFunc = nullptr*;
1776	if (Verifier.OpWidth == `8`) {
1777	StrLenFunc = emitStrLen(Ptr: MaterialzedBase, B&: Builder, DL: *DL, TLI);
1778	} else {
1779	StrLenFunc = emitWcsLen(Ptr: MaterialzedBase, B&: Builder, DL: *DL, TLI);
1780	}
1781	assert(StrLenFunc && "Failed to emit strlen function.");
1782
1783	const SCEV *StrlenEv = SE->getSCEV(V: StrLenFunc);
1784	SmallVector<PHINode *, `4`> Cleanup;
1785	for (PHINode &PN : LoopExitBB->phis()) {
1786	// We can now materialize the loop output as all phi have scev {base,+,a}.
1787	// We expand the phi as:
1788	// %strlen = call i64 @strlen(%str)
1789	// %phi.new = base expression + step %strlen*
1790	const SCEV *Ev = SE->getSCEV(V: &PN);
1791	const SCEVAddRecExpr *AddRecEv = dyn_cast<SCEVAddRecExpr>(Val: Ev);
1792	const SCEVConstant *Step =
1793	dyn_cast<SCEVConstant>(Val: AddRecEv->getStepRecurrence(SE&: *SE));
1794	const SCEV *Base = AddRecEv->getStart();
1795
1796	// It is safe to truncate to base since if base is narrower than size_t
1797	// the equivalent user code will have to truncate anyways.
1798	const SCEV *NewEv = SE->getAddExpr(
1799	LHS: Base, RHS: SE->getMulExpr(LHS: Step, RHS: SE->getTruncateOrSignExtend(
1800	V: StrlenEv, Ty: Base->getType())));
1801
1802	Value *MaterializedPHI = Expander.expandCodeFor(SH: NewEv, Ty: NewEv->getType(),
1803	I: Builder.GetInsertPoint());
1804	Expander.clear();
1805	PN.replaceAllUsesWith(V: MaterializedPHI);
1806	Cleanup.push_back(Elt: &PN);
1807	}
1808
1809	// All LCSSA Loop Phi are dead, the left over dead loop body can be cleaned
1810	// up by later passes
1811	for (PHINode *PN : Cleanup)
1812	RecursivelyDeleteDeadPHINode(PN);
1813
1814	// LoopDeletion only delete invariant loops with known trip-count. We can
1815	// update the condition so it will reliablely delete the invariant loop
1816	assert(LoopTerm->getNumSuccessors() == `2` &&
1817	(LoopTerm->getSuccessor(`0`) == LoopBody \|\|
1818	LoopTerm->getSuccessor(`1`) == LoopBody) &&
1819	"loop body must have a successor that is it self");
1820	ConstantInt *NewLoopCond = LoopTerm->getSuccessor(i: `0`) == LoopBody
1821	? Builder.getFalse()
1822	: Builder.getTrue();
1823	LoopTerm->setCondition(NewLoopCond);
1824	SE->forgetLoop(L: CurLoop);
1825
1826	++NumStrLen;
1827	LLVM_DEBUG(dbgs() << " Formed strlen idiom: " << *StrLenFunc << "\n");
1828	ORE.emit(RemarkBuilder: [&]() {
1829	return OptimizationRemark (DEBUG_TYPE, "recognizeAndInsertStrLen",
1830	CurLoop->getStartLoc(), Preheader)
1831	<< "Transformed " << StrLenFunc->getName() << " loop idiom";
1832	});
1833
1834	return true;
1835	}
1836
1837	/// Check if the given conditional branch is based on an unsigned less-than
1838	/// comparison between a variable and a constant, and if the comparison is false
1839	/// the control yields to the loop entry. If the branch matches the behaviour,
1840	/// the variable involved in the comparison is returned.
1841	static Value matchShiftULTCondition(BranchInst BI, BasicBlock *LoopEntry,
1842	APInt &Threshold) {
1843	if (!BI \|\| !BI->isConditional())
1844	return nullptr;
1845
1846	ICmpInst *Cond = dyn_cast<ICmpInst>(Val: BI->getCondition());
1847	if (!Cond)
1848	return nullptr;
1849
1850	ConstantInt *CmpConst = dyn_cast<ConstantInt>(Val: Cond->getOperand(i_nocapture: `1`));
1851	if (!CmpConst)
1852	return nullptr;
1853
1854	BasicBlock *FalseSucc = BI->getSuccessor(i: `1`);
1855	ICmpInst::Predicate Pred = Cond->getPredicate();
1856
1857	if (Pred == ICmpInst::ICMP_ULT && FalseSucc == LoopEntry) {
1858	Threshold = CmpConst->getValue();
1859	return Cond->getOperand(i_nocapture: `0`);
1860	}
1861
1862	return nullptr;
1863	}
1864
1865	// Check if the recurrence variable `VarX` is in the right form to create
1866	// the idiom. Returns the value coerced to a PHINode if so.
1867	static PHINode getRecurrenceVar(Value VarX, Instruction *DefX,
1868	BasicBlock *LoopEntry) {
1869	auto *PhiX = dyn_cast<PHINode>(Val: VarX);
1870	if (PhiX && PhiX->getParent() == LoopEntry &&
1871	(PhiX->getOperand(i_nocapture: `0`) == DefX \|\| PhiX->getOperand(i_nocapture: `1`) == DefX))
1872	return PhiX;
1873	return nullptr;
1874	}
1875
1876	/// Return true if the idiom is detected in the loop.
1877	///
1878	/// Additionally:
1879	/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
1880	/// or nullptr if there is no such.
1881	/// 2) \p CntPhi is set to the corresponding phi node
1882	/// or nullptr if there is no such.
1883	/// 3) \p InitX is set to the value whose CTLZ could be used.
1884	/// 4) \p DefX is set to the instruction calculating Loop exit condition.
1885	/// 5) \p Threshold is set to the constant involved in the unsigned less-than
1886	/// comparison.
1887	///
1888	/// The core idiom we are trying to detect is:
1889	/// \code
1890	/// if (x0 < 2)
1891	/// goto loop-exit // the precondition of the loop
1892	/// cnt0 = init-val
1893	/// do {
1894	/// x = phi (x0, x.next); //PhiX
1895	/// cnt = phi (cnt0, cnt.next)
1896	///
1897	/// cnt.next = cnt + 1;
1898	/// ...
1899	/// x.next = x >> 1; // DefX
1900	/// } while (x >= 4)
1901	/// loop-exit:
1902	/// \endcode
1903	static bool detectShiftUntilLessThanIdiom(Loop CurLoop, const* DataLayout &DL,
1904	Intrinsic::ID &IntrinID,
1905	Value &InitX, Instruction &CntInst,
1906	PHINode &CntPhi, Instruction &DefX,
1907	APInt &Threshold) {
1908	BasicBlock *LoopEntry;
1909
1910	DefX = nullptr;
1911	CntInst = nullptr;
1912	CntPhi = nullptr;
1913	LoopEntry = *(CurLoop->block_begin());
1914
1915	// step 1: Check if the loop-back branch is in desirable form.
1916	if (Value *T = matchShiftULTCondition(
1917	BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry,
1918	Threshold))
1919	DefX = dyn_cast<Instruction>(Val: T);
1920	else
1921	return false;
1922
1923	// step 2: Check the recurrence of variable X
1924	if (!DefX \|\| !isa<PHINode>(Val: DefX))
1925	return false;
1926
1927	PHINode *VarPhi = cast<PHINode>(Val: DefX);
1928	int Idx = VarPhi->getBasicBlockIndex(BB: LoopEntry);
1929	if (Idx == -`1`)
1930	return false;
1931
1932	DefX = dyn_cast<Instruction>(Val: VarPhi->getIncomingValue(i: Idx));
1933	if (!DefX \|\| DefX->getNumOperands() == `0` \|\| DefX->getOperand(i: `0`) != VarPhi)
1934	return false;
1935
1936	// step 3: detect instructions corresponding to "x.next = x >> 1"
1937	if (DefX->getOpcode() != Instruction::LShr)
1938	return false;
1939
1940	IntrinID = Intrinsic::ctlz;
1941	ConstantInt *Shft = dyn_cast<ConstantInt>(Val: DefX->getOperand(i: `1`));
1942	if (!Shft \|\| !Shft->isOne())
1943	return false;
1944
1945	InitX = VarPhi->getIncomingValueForBlock(BB: CurLoop->getLoopPreheader());
1946
1947	// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
1948	// or cnt.next = cnt + -1.
1949	// TODO: We can skip the step. If loop trip count is known (CTLZ),
1950	// then all uses of "cnt.next" could be optimized to the trip count
1951	// plus "cnt0". Currently it is not optimized.
1952	// This step could be used to detect POPCNT instruction:
1953	// cnt.next = cnt + (x.next & 1)
1954	for (Instruction &Inst :
1955	llvm::make_range(x: LoopEntry->getFirstNonPHIIt(), y: LoopEntry->end())) {
1956	if (Inst.getOpcode() != Instruction::Add)
1957	continue;
1958
1959	ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: `1`));
1960	if (!Inc \|\| (!Inc->isOne() && !Inc->isMinusOne()))
1961	continue;
1962
1963	PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: `0`), DefX: &Inst, LoopEntry);
1964	if (!Phi)
1965	continue;
1966
1967	CntInst = &Inst;
1968	CntPhi = Phi;
1969	break;
1970	}
1971	if (!CntInst)
1972	return false;
1973
1974	return true;
1975	}
1976
1977	/// Return true iff the idiom is detected in the loop.
1978	///
1979	/// Additionally:
1980	/// 1) \p CntInst is set to the instruction counting the population bit.
1981	/// 2) \p CntPhi is set to the corresponding phi node.
1982	/// 3) \p Var is set to the value whose population bits are being counted.
1983	///
1984	/// The core idiom we are trying to detect is:
1985	/// \code
1986	/// if (x0 != 0)
1987	/// goto loop-exit // the precondition of the loop
1988	/// cnt0 = init-val;
1989	/// do {
1990	/// x1 = phi (x0, x2);
1991	/// cnt1 = phi(cnt0, cnt2);
1992	///
1993	/// cnt2 = cnt1 + 1;
1994	/// ...
1995	/// x2 = x1 & (x1 - 1);
1996	/// ...
1997	/// } while(x != 0);
1998	///
1999	/// loop-exit:
2000	/// \endcode
2001	static bool detectPopcountIdiom(Loop CurLoop, BasicBlock PreCondBB,
2002	Instruction &CntInst, PHINode &CntPhi,
2003	Value *&Var) {
2004	// step 1: Check to see if the look-back branch match this pattern:
2005	// "if (a!=0) goto loop-entry".
2006	BasicBlock *LoopEntry;
2007	Instruction DefX2, CountInst;
2008	Value VarX1, VarX0;
2009	PHINode PhiX, CountPhi;
2010
2011	DefX2 = CountInst = nullptr;
2012	VarX1 = VarX0 = nullptr;
2013	PhiX = CountPhi = nullptr;
2014	LoopEntry = *(CurLoop->block_begin());
2015
2016	// step 1: Check if the loop-back branch is in desirable form.
2017	{
2018	if (Value *T = matchCondition(
2019	BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry))
2020	DefX2 = dyn_cast<Instruction>(Val: T);
2021	else
2022	return false;
2023	}
2024
2025	// step 2: detect instructions corresponding to "x2 = x1 & (x1 - 1)"
2026	{
2027	if (!DefX2 \|\| DefX2->getOpcode() != Instruction::And)
2028	return false;
2029
2030	BinaryOperator *SubOneOp;
2031
2032	if ((SubOneOp = dyn_cast<BinaryOperator>(Val: DefX2->getOperand(i: `0`))))
2033	VarX1 = DefX2->getOperand(i: `1`);
2034	else {
2035	VarX1 = DefX2->getOperand(i: `0`);
2036	SubOneOp = dyn_cast<BinaryOperator>(Val: DefX2->getOperand(i: `1`));
2037	}
2038	if (!SubOneOp \|\| SubOneOp->getOperand(i_nocapture: `0`) != VarX1)
2039	return false;
2040
2041	ConstantInt *Dec = dyn_cast<ConstantInt>(Val: SubOneOp->getOperand(i_nocapture: `1`));
2042	if (!Dec \|\|
2043	!((SubOneOp->getOpcode() == Instruction::Sub && Dec->isOne()) \|\|
2044	(SubOneOp->getOpcode() == Instruction::Add &&
2045	Dec->isMinusOne()))) {
2046	return false;
2047	}
2048	}
2049
2050	// step 3: Check the recurrence of variable X
2051	PhiX = getRecurrenceVar(VarX: VarX1, DefX: DefX2, LoopEntry);
2052	if (!PhiX)
2053	return false;
2054
2055	// step 4: Find the instruction which count the population: cnt2 = cnt1 + 1
2056	{
2057	CountInst = nullptr;
2058	for (Instruction &Inst :
2059	llvm::make_range(x: LoopEntry->getFirstNonPHIIt(), y: LoopEntry->end())) {
2060	if (Inst.getOpcode() != Instruction::Add)
2061	continue;
2062
2063	ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: `1`));
2064	if (!Inc \|\| !Inc->isOne())
2065	continue;
2066
2067	PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: `0`), DefX: &Inst, LoopEntry);
2068	if (!Phi)
2069	continue;
2070
2071	// Check if the result of the instruction is live of the loop.
2072	bool LiveOutLoop = false;
2073	for (User *U : Inst.users()) {
2074	if ((cast<Instruction>(Val: U))->getParent() != LoopEntry) {
2075	LiveOutLoop = true;
2076	break;
2077	}
2078	}
2079
2080	if (LiveOutLoop) {
2081	CountInst = &Inst;
2082	CountPhi = Phi;
2083	break;
2084	}
2085	}
2086
2087	if (!CountInst)
2088	return false;
2089	}
2090
2091	// step 5: check if the precondition is in this form:
2092	// "if (x != 0) goto loop-head ; else goto somewhere-we-don't-care;"
2093	{
2094	auto *PreCondBr = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2095	Value *T = matchCondition(BI: PreCondBr, LoopEntry: CurLoop->getLoopPreheader());
2096	if (T != PhiX->getOperand(i_nocapture: `0`) && T != PhiX->getOperand(i_nocapture: `1`))
2097	return false;
2098
2099	CntInst = CountInst;
2100	CntPhi = CountPhi;
2101	Var = T;
2102	}
2103
2104	return true;
2105	}
2106
2107	/// Return true if the idiom is detected in the loop.
2108	///
2109	/// Additionally:
2110	/// 1) \p CntInst is set to the instruction Counting Leading Zeros (CTLZ)
2111	/// or nullptr if there is no such.
2112	/// 2) \p CntPhi is set to the corresponding phi node
2113	/// or nullptr if there is no such.
2114	/// 3) \p Var is set to the value whose CTLZ could be used.
2115	/// 4) \p DefX is set to the instruction calculating Loop exit condition.
2116	///
2117	/// The core idiom we are trying to detect is:
2118	/// \code
2119	/// if (x0 == 0)
2120	/// goto loop-exit // the precondition of the loop
2121	/// cnt0 = init-val;
2122	/// do {
2123	/// x = phi (x0, x.next); //PhiX
2124	/// cnt = phi(cnt0, cnt.next);
2125	///
2126	/// cnt.next = cnt + 1;
2127	/// ...
2128	/// x.next = x >> 1; // DefX
2129	/// ...
2130	/// } while(x.next != 0);
2131	///
2132	/// loop-exit:
2133	/// \endcode
2134	static bool detectShiftUntilZeroIdiom(Loop CurLoop, const* DataLayout &DL,
2135	Intrinsic::ID &IntrinID, Value *&InitX,
2136	Instruction &CntInst, PHINode &CntPhi,
2137	Instruction *&DefX) {
2138	BasicBlock *LoopEntry;
2139	Value VarX = nullptr*;
2140
2141	DefX = nullptr;
2142	CntInst = nullptr;
2143	CntPhi = nullptr;
2144	LoopEntry = *(CurLoop->block_begin());
2145
2146	// step 1: Check if the loop-back branch is in desirable form.
2147	if (Value *T = matchCondition(
2148	BI: dyn_cast<BranchInst>(Val: LoopEntry->getTerminator()), LoopEntry))
2149	DefX = dyn_cast<Instruction>(Val: T);
2150	else
2151	return false;
2152
2153	// step 2: detect instructions corresponding to "x.next = x >> 1 or x << 1"
2154	if (!DefX \|\| !DefX->isShift())
2155	return false;
2156	IntrinID = DefX->getOpcode() == Instruction::Shl ? Intrinsic::cttz :
2157	Intrinsic::ctlz;
2158	ConstantInt *Shft = dyn_cast<ConstantInt>(Val: DefX->getOperand(i: `1`));
2159	if (!Shft \|\| !Shft->isOne())
2160	return false;
2161	VarX = DefX->getOperand(i: `0`);
2162
2163	// step 3: Check the recurrence of variable X
2164	PHINode *PhiX = getRecurrenceVar(VarX, DefX, LoopEntry);
2165	if (!PhiX)
2166	return false;
2167
2168	InitX = PhiX->getIncomingValueForBlock(BB: CurLoop->getLoopPreheader());
2169
2170	// Make sure the initial value can't be negative otherwise the ashr in the
2171	// loop might never reach zero which would make the loop infinite.
2172	if (DefX->getOpcode() == Instruction::AShr && !isKnownNonNegative(V: InitX, SQ: DL))
2173	return false;
2174
2175	// step 4: Find the instruction which count the CTLZ: cnt.next = cnt + 1
2176	// or cnt.next = cnt + -1.
2177	// TODO: We can skip the step. If loop trip count is known (CTLZ),
2178	// then all uses of "cnt.next" could be optimized to the trip count
2179	// plus "cnt0". Currently it is not optimized.
2180	// This step could be used to detect POPCNT instruction:
2181	// cnt.next = cnt + (x.next & 1)
2182	for (Instruction &Inst :
2183	llvm::make_range(x: LoopEntry->getFirstNonPHIIt(), y: LoopEntry->end())) {
2184	if (Inst.getOpcode() != Instruction::Add)
2185	continue;
2186
2187	ConstantInt *Inc = dyn_cast<ConstantInt>(Val: Inst.getOperand(i: `1`));
2188	if (!Inc \|\| (!Inc->isOne() && !Inc->isMinusOne()))
2189	continue;
2190
2191	PHINode *Phi = getRecurrenceVar(VarX: Inst.getOperand(i: `0`), DefX: &Inst, LoopEntry);
2192	if (!Phi)
2193	continue;
2194
2195	CntInst = &Inst;
2196	CntPhi = Phi;
2197	break;
2198	}
2199	if (!CntInst)
2200	return false;
2201
2202	return true;
2203	}
2204
2205	// Check if CTLZ / CTTZ intrinsic is profitable. Assume it is always
2206	// profitable if we delete the loop.
2207	bool LoopIdiomRecognize::isProfitableToInsertFFS(Intrinsic::ID IntrinID,
2208	Value InitX, bool* ZeroCheck,
2209	size_t CanonicalSize) {
2210	const Value *Args[] = {InitX,
2211	ConstantInt::getBool(Context&: InitX->getContext(), V: ZeroCheck)};
2212
2213	// @llvm.dbg doesn't count as they have no semantic effect.
2214	auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
2215	uint32_t HeaderSize =
2216	std::distance(first: InstWithoutDebugIt.begin(), last: InstWithoutDebugIt.end());
2217
2218	IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
2219	InstructionCost Cost = TTI->getIntrinsicInstrCost(
2220	ICA: Attrs, CostKind: TargetTransformInfo::TCK_SizeAndLatency);
2221	if (HeaderSize != CanonicalSize && Cost > TargetTransformInfo::TCC_Basic)
2222	return false;
2223
2224	return true;
2225	}
2226
2227	/// Convert CTLZ / CTTZ idiom loop into countable loop.
2228	/// If CTLZ / CTTZ inserted as a new trip count returns true; otherwise,
2229	/// returns false.
2230	bool LoopIdiomRecognize::insertFFSIfProfitable(Intrinsic::ID IntrinID,
2231	Value InitX, Instruction DefX,
2232	PHINode *CntPhi,
2233	Instruction *CntInst) {
2234	bool IsCntPhiUsedOutsideLoop = false;
2235	for (User *U : CntPhi->users())
2236	if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) {
2237	IsCntPhiUsedOutsideLoop = true;
2238	break;
2239	}
2240	bool IsCntInstUsedOutsideLoop = false;
2241	for (User *U : CntInst->users())
2242	if (!CurLoop->contains(Inst: cast<Instruction>(Val: U))) {
2243	IsCntInstUsedOutsideLoop = true;
2244	break;
2245	}
2246	// If both CntInst and CntPhi are used outside the loop the profitability
2247	// is questionable.
2248	if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
2249	return false;
2250
2251	// For some CPUs result of CTLZ(X) intrinsic is undefined
2252	// when X is 0. If we can not guarantee X != 0, we need to check this
2253	// when expand.
2254	bool ZeroCheck = false;
2255	// It is safe to assume Preheader exist as it was checked in
2256	// parent function RunOnLoop.
2257	BasicBlock *PH = CurLoop->getLoopPreheader();
2258
2259	// If we are using the count instruction outside the loop, make sure we
2260	// have a zero check as a precondition. Without the check the loop would run
2261	// one iteration for before any check of the input value. This means 0 and 1
2262	// would have identical behavior in the original loop and thus
2263	if (!IsCntPhiUsedOutsideLoop) {
2264	auto *PreCondBB = PH->getSinglePredecessor();
2265	if (!PreCondBB)
2266	return false;
2267	auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2268	if (!PreCondBI)
2269	return false;
2270	if (matchCondition(BI: PreCondBI, LoopEntry: PH) != InitX)
2271	return false;
2272	ZeroCheck = true;
2273	}
2274
2275	// FFS idiom loop has only 6 instructions:
2276	// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2277	// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2278	// %shr = ashr %n.addr.0, 1
2279	// %tobool = icmp eq %shr, 0
2280	// %inc = add nsw %i.0, 1
2281	// br i1 %tobool
2282	size_t IdiomCanonicalSize = `6`;
2283	if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, CanonicalSize: IdiomCanonicalSize))
2284	return false;
2285
2286	transformLoopToCountable(IntrinID, PreCondBB: PH, CntInst, CntPhi, Var: InitX, DefX,
2287	DL: DefX->getDebugLoc(), ZeroCheck,
2288	IsCntPhiUsedOutsideLoop);
2289	return true;
2290	}
2291
2292	/// Recognize CTLZ or CTTZ idiom in a non-countable loop and convert the loop
2293	/// to countable (with CTLZ / CTTZ trip count). If CTLZ / CTTZ inserted as a new
2294	/// trip count returns true; otherwise, returns false.
2295	bool LoopIdiomRecognize::recognizeAndInsertFFS() {
2296	// Give up if the loop has multiple blocks or multiple backedges.
2297	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1`)
2298	return false;
2299
2300	Intrinsic::ID IntrinID;
2301	Value *InitX;
2302	Instruction DefX = nullptr*;
2303	PHINode CntPhi = nullptr*;
2304	Instruction CntInst = nullptr*;
2305
2306	if (!detectShiftUntilZeroIdiom(CurLoop, DL: *DL, IntrinID, InitX, CntInst, CntPhi,
2307	DefX))
2308	return false;
2309
2310	return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2311	}
2312
2313	bool LoopIdiomRecognize::recognizeShiftUntilLessThan() {
2314	// Give up if the loop has multiple blocks or multiple backedges.
2315	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1`)
2316	return false;
2317
2318	Intrinsic::ID IntrinID;
2319	Value *InitX;
2320	Instruction DefX = nullptr*;
2321	PHINode CntPhi = nullptr*;
2322	Instruction CntInst = nullptr*;
2323
2324	APInt LoopThreshold;
2325	if (!detectShiftUntilLessThanIdiom(CurLoop, DL: *DL, IntrinID, InitX, CntInst,
2326	CntPhi, DefX, Threshold&: LoopThreshold))
2327	return false;
2328
2329	if (LoopThreshold == `2`) {
2330	// Treat as regular FFS.
2331	return insertFFSIfProfitable(IntrinID, InitX, DefX, CntPhi, CntInst);
2332	}
2333
2334	// Look for Floor Log2 Idiom.
2335	if (LoopThreshold != `4`)
2336	return false;
2337
2338	// Abort if CntPhi is used outside of the loop.
2339	for (User *U : CntPhi->users())
2340	if (!CurLoop->contains(Inst: cast<Instruction>(Val: U)))
2341	return false;
2342
2343	// It is safe to assume Preheader exist as it was checked in
2344	// parent function RunOnLoop.
2345	BasicBlock *PH = CurLoop->getLoopPreheader();
2346	auto *PreCondBB = PH->getSinglePredecessor();
2347	if (!PreCondBB)
2348	return false;
2349	auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2350	if (!PreCondBI)
2351	return false;
2352
2353	APInt PreLoopThreshold;
2354	if (matchShiftULTCondition(BI: PreCondBI, LoopEntry: PH, Threshold&: PreLoopThreshold) != InitX \|\|
2355	PreLoopThreshold != `2`)
2356	return false;
2357
2358	bool ZeroCheck = true;
2359
2360	// the loop has only 6 instructions:
2361	// %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
2362	// %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
2363	// %shr = ashr %n.addr.0, 1
2364	// %tobool = icmp ult %n.addr.0, C
2365	// %inc = add nsw %i.0, 1
2366	// br i1 %tobool
2367	size_t IdiomCanonicalSize = `6`;
2368	if (!isProfitableToInsertFFS(IntrinID, InitX, ZeroCheck, CanonicalSize: IdiomCanonicalSize))
2369	return false;
2370
2371	// log2(x) = w − 1 − clz(x)
2372	transformLoopToCountable(IntrinID, PreCondBB: PH, CntInst, CntPhi, Var: InitX, DefX,
2373	DL: DefX->getDebugLoc(), ZeroCheck,
2374	/IsCntPhiUsedOutsideLoop=/false,
2375	/InsertSub=/true);
2376	return true;
2377	}
2378
2379	/// Recognizes a population count idiom in a non-countable loop.
2380	///
2381	/// If detected, transforms the relevant code to issue the popcount intrinsic
2382	/// function call, and returns true; otherwise, returns false.
2383	bool LoopIdiomRecognize::recognizePopcount() {
2384	if (TTI->getPopcntSupport(IntTyWidthInBit: `32`) != TargetTransformInfo::PSK_FastHardware)
2385	return false;
2386
2387	// Counting population are usually conducted by few arithmetic instructions.
2388	// Such instructions can be easily "absorbed" by vacant slots in a
2389	// non-compact loop. Therefore, recognizing popcount idiom only makes sense
2390	// in a compact loop.
2391
2392	// Give up if the loop has multiple blocks or multiple backedges.
2393	if (CurLoop->getNumBackEdges() != `1` \|\| CurLoop->getNumBlocks() != `1`)
2394	return false;
2395
2396	BasicBlock LoopBody = (CurLoop->block_begin());
2397	if (LoopBody->size() >= `20`) {
2398	// The loop is too big, bail out.
2399	return false;
2400	}
2401
2402	// It should have a preheader containing nothing but an unconditional branch.
2403	BasicBlock *PH = CurLoop->getLoopPreheader();
2404	if (!PH \|\| &PH->front() != PH->getTerminator())
2405	return false;
2406	auto *EntryBI = dyn_cast<BranchInst>(Val: PH->getTerminator());
2407	if (!EntryBI \|\| EntryBI->isConditional())
2408	return false;
2409
2410	// It should have a precondition block where the generated popcount intrinsic
2411	// function can be inserted.
2412	auto *PreCondBB = PH->getSinglePredecessor();
2413	if (!PreCondBB)
2414	return false;
2415	auto *PreCondBI = dyn_cast<BranchInst>(Val: PreCondBB->getTerminator());
2416	if (!PreCondBI \|\| PreCondBI->isUnconditional())
2417	return false;
2418
2419	Instruction *CntInst;
2420	PHINode *CntPhi;
2421	Value *Val;
2422	if (!detectPopcountIdiom(CurLoop, PreCondBB, CntInst, CntPhi, Var&: Val))
2423	return false;
2424
2425	transformLoopToPopcount(PreCondBB, CntInst, CntPhi, Var: Val);
2426	return true;
2427	}
2428
2429	static CallInst createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value Val,
2430	const DebugLoc &DL) {
2431	Value *Ops[] = {Val};
2432	Type *Tys[] = {Val->getType()};
2433
2434	CallInst *CI = IRBuilder.CreateIntrinsic(ID: Intrinsic::ctpop, Types: Tys, Args: Ops);
2435	CI->setDebugLoc(DL);
2436
2437	return CI;
2438	}
2439
2440	static CallInst createFFSIntrinsic(IRBuilder<> &IRBuilder, Value Val,
2441	const DebugLoc &DL, bool ZeroCheck,
2442	Intrinsic::ID IID) {
2443	Value *Ops[] = {Val, IRBuilder.getInt1(V: ZeroCheck)};
2444	Type *Tys[] = {Val->getType()};
2445
2446	CallInst *CI = IRBuilder.CreateIntrinsic(ID: IID, Types: Tys, Args: Ops);
2447	CI->setDebugLoc(DL);
2448
2449	return CI;
2450	}
2451
2452	/// Transform the following loop (Using CTLZ, CTTZ is similar):
2453	/// loop:
2454	/// CntPhi = PHI [Cnt0, CntInst]
2455	/// PhiX = PHI [InitX, DefX]
2456	/// CntInst = CntPhi + 1
2457	/// DefX = PhiX >> 1
2458	/// LOOP_BODY
2459	/// Br: loop if (DefX != 0)
2460	/// Use(CntPhi) or Use(CntInst)
2461	///
2462	/// Into:
2463	/// If CntPhi used outside the loop:
2464	/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1)
2465	/// Count = CountPrev + 1
2466	/// else
2467	/// Count = BitWidth(InitX) - CTLZ(InitX)
2468	/// loop:
2469	/// CntPhi = PHI [Cnt0, CntInst]
2470	/// PhiX = PHI [InitX, DefX]
2471	/// PhiCount = PHI [Count, Dec]
2472	/// CntInst = CntPhi + 1
2473	/// DefX = PhiX >> 1
2474	/// Dec = PhiCount - 1
2475	/// LOOP_BODY
2476	/// Br: loop if (Dec != 0)
2477	/// Use(CountPrev + Cnt0) // Use(CntPhi)
2478	/// or
2479	/// Use(Count + Cnt0) // Use(CntInst)
2480	///
2481	/// If LOOP_BODY is empty the loop will be deleted.
2482	/// If CntInst and DefX are not used in LOOP_BODY they will be removed.
2483	void LoopIdiomRecognize::transformLoopToCountable(
2484	Intrinsic::ID IntrinID, BasicBlock Preheader, Instruction CntInst,
2485	PHINode CntPhi, Value InitX, Instruction DefX, const* DebugLoc &DL,
2486	bool ZeroCheck, bool IsCntPhiUsedOutsideLoop, bool InsertSub) {
2487	BranchInst *PreheaderBr = cast<BranchInst>(Val: Preheader->getTerminator());
2488
2489	// Step 1: Insert the CTLZ/CTTZ instruction at the end of the preheader block
2490	IRBuilder<> Builder(PreheaderBr);
2491	Builder.SetCurrentDebugLocation(DL);
2492
2493	// If there are no uses of CntPhi crate:
2494	// Count = BitWidth - CTLZ(InitX);
2495	// NewCount = Count;
2496	// If there are uses of CntPhi create:
2497	// NewCount = BitWidth - CTLZ(InitX >> 1);
2498	// Count = NewCount + 1;
2499	Value *InitXNext;
2500	if (IsCntPhiUsedOutsideLoop) {
2501	if (DefX->getOpcode() == Instruction::AShr)
2502	InitXNext = Builder.CreateAShr(LHS: InitX, RHS: `1`);
2503	else if (DefX->getOpcode() == Instruction::LShr)
2504	InitXNext = Builder.CreateLShr(LHS: InitX, RHS: `1`);
2505	else if (DefX->getOpcode() == Instruction::Shl) // cttz
2506	InitXNext = Builder.CreateShl(LHS: InitX, RHS: `1`);
2507	else
2508	llvm_unreachable("Unexpected opcode!");
2509	} else
2510	InitXNext = InitX;
2511	Value *Count =
2512	createFFSIntrinsic(IRBuilder&: Builder, Val: InitXNext, DL, ZeroCheck, IID: IntrinID);
2513	Type *CountTy = Count->getType();
2514	Count = Builder.CreateSub(
2515	LHS: ConstantInt::get(Ty: CountTy, V: CountTy->getIntegerBitWidth()), RHS: Count);
2516	if (InsertSub)
2517	Count = Builder.CreateSub(LHS: Count, RHS: ConstantInt::get(Ty: CountTy, V: `1`));
2518	Value *NewCount = Count;
2519	if (IsCntPhiUsedOutsideLoop)
2520	Count = Builder.CreateAdd(LHS: Count, RHS: ConstantInt::get(Ty: CountTy, V: `1`));
2521
2522	NewCount = Builder.CreateZExtOrTrunc(V: NewCount, DestTy: CntInst->getType());
2523
2524	Value *CntInitVal = CntPhi->getIncomingValueForBlock(BB: Preheader);
2525	if (cast<ConstantInt>(Val: CntInst->getOperand(i: `1`))->isOne()) {
2526	// If the counter was being incremented in the loop, add NewCount to the
2527	// counter's initial value, but only if the initial value is not zero.
2528	ConstantInt *InitConst = dyn_cast<ConstantInt>(Val: CntInitVal);
2529	if (!InitConst \|\| !InitConst->isZero())
2530	NewCount = Builder.CreateAdd(LHS: NewCount, RHS: CntInitVal);
2531	} else {
2532	// If the count was being decremented in the loop, subtract NewCount from
2533	// the counter's initial value.
2534	NewCount = Builder.CreateSub(LHS: CntInitVal, RHS: NewCount);
2535	}
2536
2537	// Step 2: Insert new IV and loop condition:
2538	// loop:
2539	// ...
2540	// PhiCount = PHI [Count, Dec]
2541	// ...
2542	// Dec = PhiCount - 1
2543	// ...
2544	// Br: loop if (Dec != 0)
2545	BasicBlock Body = (CurLoop->block_begin());
2546	auto *LbBr = cast<BranchInst>(Val: Body->getTerminator());
2547	ICmpInst *LbCond = cast<ICmpInst>(Val: LbBr->getCondition());
2548
2549	PHINode *TcPhi = PHINode::Create(Ty: CountTy, NumReservedValues: `2`, NameStr: "tcphi");
2550	TcPhi->insertBefore(InsertPos: Body->begin());
2551
2552	Builder.SetInsertPoint(LbCond);
2553	Instruction *TcDec = cast<Instruction>(Val: Builder.CreateSub(
2554	LHS: TcPhi, RHS: ConstantInt::get(Ty: CountTy, V: `1`), Name: "tcdec", HasNUW: false, HasNSW: true));
2555
2556	TcPhi->addIncoming(V: Count, BB: Preheader);
2557	TcPhi->addIncoming(V: TcDec, BB: Body);
2558
2559	CmpInst::Predicate Pred =
2560	(LbBr->getSuccessor(i: `0`) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ;
2561	LbCond->setPredicate(Pred);
2562	LbCond->setOperand(i_nocapture: `0`, Val_nocapture: TcDec);
2563	LbCond->setOperand(i_nocapture: `1`, Val_nocapture: ConstantInt::get(Ty: CountTy, V: `0`));
2564
2565	// Step 3: All the references to the original counter outside
2566	// the loop are replaced with the NewCount
2567	if (IsCntPhiUsedOutsideLoop)
2568	CntPhi->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2569	else
2570	CntInst->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2571
2572	// step 4: Forget the "non-computable" trip-count SCEV associated with the
2573	// loop. The loop would otherwise not be deleted even if it becomes empty.
2574	SE->forgetLoop(L: CurLoop);
2575	}
2576
2577	void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB,
2578	Instruction *CntInst,
2579	PHINode CntPhi, Value Var) {
2580	BasicBlock *PreHead = CurLoop->getLoopPreheader();
2581	auto *PreCondBr = cast<BranchInst>(Val: PreCondBB->getTerminator());
2582	const DebugLoc &DL = CntInst->getDebugLoc();
2583
2584	// Assuming before transformation, the loop is following:
2585	// if (x) // the precondition
2586	// do { cnt++; x &= x - 1; } while(x);
2587
2588	// Step 1: Insert the ctpop instruction at the end of the precondition block
2589	IRBuilder<> Builder(PreCondBr);
2590	Value PopCnt, PopCntZext, NewCount, TripCnt;
2591	{
2592	PopCnt = createPopcntIntrinsic(IRBuilder&: Builder, Val: Var, DL);
2593	NewCount = PopCntZext =
2594	Builder.CreateZExtOrTrunc(V: PopCnt, DestTy: cast<IntegerType>(Val: CntPhi->getType()));
2595
2596	if (NewCount != PopCnt)
2597	(cast<Instruction>(Val: NewCount))->setDebugLoc(DL);
2598
2599	// TripCnt is exactly the number of iterations the loop has
2600	TripCnt = NewCount;
2601
2602	// If the population counter's initial value is not zero, insert Add Inst.
2603	Value *CntInitVal = CntPhi->getIncomingValueForBlock(BB: PreHead);
2604	ConstantInt *InitConst = dyn_cast<ConstantInt>(Val: CntInitVal);
2605	if (!InitConst \|\| !InitConst->isZero()) {
2606	NewCount = Builder.CreateAdd(LHS: NewCount, RHS: CntInitVal);
2607	(cast<Instruction>(Val: NewCount))->setDebugLoc(DL);
2608	}
2609	}
2610
2611	// Step 2: Replace the precondition from "if (x == 0) goto loop-exit" to
2612	// "if (NewCount == 0) loop-exit". Without this change, the intrinsic
2613	// function would be partial dead code, and downstream passes will drag
2614	// it back from the precondition block to the preheader.
2615	{
2616	ICmpInst *PreCond = cast<ICmpInst>(Val: PreCondBr->getCondition());
2617
2618	Value *Opnd0 = PopCntZext;
2619	Value *Opnd1 = ConstantInt::get(Ty: PopCntZext->getType(), V: `0`);
2620	if (PreCond->getOperand(i_nocapture: `0`) != Var)
2621	std::swap(a&: Opnd0, b&: Opnd1);
2622
2623	ICmpInst *NewPreCond = cast<ICmpInst>(
2624	Val: Builder.CreateICmp(P: PreCond->getPredicate(), LHS: Opnd0, RHS: Opnd1));
2625	PreCondBr->setCondition(NewPreCond);
2626
2627	RecursivelyDeleteTriviallyDeadInstructions(V: PreCond, TLI);
2628	}
2629
2630	// Step 3: Note that the population count is exactly the trip count of the
2631	// loop in question, which enable us to convert the loop from noncountable
2632	// loop into a countable one. The benefit is twofold:
2633	//
2634	// - If the loop only counts population, the entire loop becomes dead after
2635	// the transformation. It is a lot easier to prove a countable loop dead
2636	// than to prove a noncountable one. (In some C dialects, an infinite loop
2637	// isn't dead even if it computes nothing useful. In general, DCE needs
2638	// to prove a noncountable loop finite before safely delete it.)
2639	//
2640	// - If the loop also performs something else, it remains alive.
2641	// Since it is transformed to countable form, it can be aggressively
2642	// optimized by some optimizations which are in general not applicable
2643	// to a noncountable loop.
2644	//
2645	// After this step, this loop (conceptually) would look like following:
2646	// newcnt = __builtin_ctpop(x);
2647	// t = newcnt;
2648	// if (x)
2649	// do { cnt++; x &= x-1; t--) } while (t > 0);
2650	BasicBlock Body = (CurLoop->block_begin());
2651	{
2652	auto *LbBr = cast<BranchInst>(Val: Body->getTerminator());
2653	ICmpInst *LbCond = cast<ICmpInst>(Val: LbBr->getCondition());
2654	Type *Ty = TripCnt->getType();
2655
2656	PHINode *TcPhi = PHINode::Create(Ty, NumReservedValues: `2`, NameStr: "tcphi");
2657	TcPhi->insertBefore(InsertPos: Body->begin());
2658
2659	Builder.SetInsertPoint(LbCond);
2660	Instruction *TcDec = cast<Instruction>(
2661	Val: Builder.CreateSub(LHS: TcPhi, RHS: ConstantInt::get(Ty, V: `1`),
2662	Name: "tcdec", HasNUW: false, HasNSW: true));
2663
2664	TcPhi->addIncoming(V: TripCnt, BB: PreHead);
2665	TcPhi->addIncoming(V: TcDec, BB: Body);
2666
2667	CmpInst::Predicate Pred =
2668	(LbBr->getSuccessor(i: `0`) == Body) ? CmpInst::ICMP_UGT : CmpInst::ICMP_SLE;
2669	LbCond->setPredicate(Pred);
2670	LbCond->setOperand(i_nocapture: `0`, Val_nocapture: TcDec);
2671	LbCond->setOperand(i_nocapture: `1`, Val_nocapture: ConstantInt::get(Ty, V: `0`));
2672	}
2673
2674	// Step 4: All the references to the original population counter outside
2675	// the loop are replaced with the NewCount -- the value returned from
2676	// __builtin_ctpop().
2677	CntInst->replaceUsesOutsideBlock(V: NewCount, BB: Body);
2678
2679	// step 5: Forget the "non-computable" trip-count SCEV associated with the
2680	// loop. The loop would otherwise not be deleted even if it becomes empty.
2681	SE->forgetLoop(L: CurLoop);
2682	}
2683
2684	/// Match loop-invariant value.
2685	template <typename SubPattern_t> struct match_LoopInvariant {
2686	SubPattern_t SubPattern;
2687	const Loop *L;
2688
2689	match_LoopInvariant(const SubPattern_t &SP, const Loop *L)
2690	: SubPattern(SP), L(L) {}
2691
2692	template <typename ITy> bool match(ITy V) const* {
2693	return L->isLoopInvariant(V) && SubPattern.match(V);
2694	}
2695	};
2696
2697	/// Matches if the value is loop-invariant.
2698	template <typename Ty>
2699	inline match_LoopInvariant<Ty> m_LoopInvariant(const Ty &M, const Loop *L) {
2700	return match_LoopInvariant<Ty>(M, L);
2701	}
2702
2703	/// Return true if the idiom is detected in the loop.
2704	///
2705	/// The core idiom we are trying to detect is:
2706	/// \code
2707	/// entry:
2708	/// <...>
2709	/// %bitmask = shl i32 1, %bitpos
2710	/// br label %loop
2711	///
2712	/// loop:
2713	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2714	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2715	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2716	/// %x.next = shl i32 %x.curr, 1
2717	/// <...>
2718	/// br i1 %x.curr.isbitunset, label %loop, label %end
2719	///
2720	/// end:
2721	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2722	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2723	/// <...>
2724	/// \endcode
2725	static bool detectShiftUntilBitTestIdiom(Loop CurLoop, Value &BaseX,
2726	Value &BitMask, Value &BitPos,
2727	Value &CurrX, Instruction &NextX) {
2728	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2729	" Performing shift-until-bittest idiom detection.\n");
2730
2731	// Give up if the loop has multiple blocks or multiple backedges.
2732	if (CurLoop->getNumBlocks() != `1` \|\| CurLoop->getNumBackEdges() != `1`) {
2733	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
2734	return false;
2735	}
2736
2737	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2738	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2739	assert(LoopPreheaderBB && "There is always a loop preheader.");
2740
2741	using namespace PatternMatch;
2742
2743	// Step 1: Check if the loop backedge is in desirable form.
2744
2745	CmpPredicate Pred;
2746	Value CmpLHS, CmpRHS;
2747	BasicBlock TrueBB, FalseBB;
2748	if (!match(V: LoopHeaderBB->getTerminator(),
2749	P: m_Br(C: m_ICmp(Pred, L: m_Value(V&: CmpLHS), R: m_Value(V&: CmpRHS)),
2750	T: m_BasicBlock(V&: TrueBB), F: m_BasicBlock(V&: FalseBB)))) {
2751	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
2752	return false;
2753	}
2754
2755	// Step 2: Check if the backedge's condition is in desirable form.
2756
2757	auto MatchVariableBitMask = [&]() {
2758	return ICmpInst::isEquality(P: Pred) && match(V: CmpRHS, P: m_Zero()) &&
2759	match(V: CmpLHS,
2760	P: m_c_And(L: m_Value(V&: CurrX),
2761	R: m_CombineAnd(
2762	L: m_Value(V&: BitMask),
2763	R: m_LoopInvariant(M: m_Shl(L: m_One(), R: m_Value(V&: BitPos)),
2764	L: CurLoop))));
2765	};
2766
2767	auto MatchDecomposableConstantBitMask = [&]() {
2768	auto Res = llvm::decomposeBitTestICmp(
2769	LHS: CmpLHS, RHS: CmpRHS, Pred, /LookThroughTrunc=/true,
2770	/AllowNonZeroC=/false, /DecomposeAnd=/true);
2771	if (Res && Res ->Mask.isPowerOf2()) {
2772	assert(ICmpInst::isEquality(Res->Pred));
2773	Pred = Res ->Pred;
2774	CurrX = Res ->X;
2775	BitMask = ConstantInt::get(Ty: CurrX->getType(), V: Res ->Mask);
2776	BitPos = ConstantInt::get(Ty: CurrX->getType(), V: Res ->Mask.logBase2());
2777	return true;
2778	}
2779	return false;
2780	};
2781
2782	if (!MatchVariableBitMask () && !MatchDecomposableConstantBitMask ()) {
2783	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge comparison.\n");
2784	return false;
2785	}
2786
2787	// Step 3: Check if the recurrence is in desirable form.
2788	auto *CurrXPN = dyn_cast<PHINode>(Val: CurrX);
2789	if (!CurrXPN \|\| CurrXPN->getParent() != LoopHeaderBB) {
2790	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
2791	return false;
2792	}
2793
2794	BaseX = CurrXPN->getIncomingValueForBlock(BB: LoopPreheaderBB);
2795	NextX =
2796	dyn_cast<Instruction>(Val: CurrXPN->getIncomingValueForBlock(BB: LoopHeaderBB));
2797
2798	assert(CurLoop->isLoopInvariant(BaseX) &&
2799	"Expected BaseX to be available in the preheader!");
2800
2801	if (!NextX \|\| !match(V: NextX, P: m_Shl(L: m_Specific(V: CurrX), R: m_One()))) {
2802	// FIXME: support right-shift?
2803	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
2804	return false;
2805	}
2806
2807	// Step 4: Check if the backedge's destinations are in desirable form.
2808
2809	assert(ICmpInst::isEquality(Pred) &&
2810	"Should only get equality predicates here.");
2811
2812	// cmp-br is commutative, so canonicalize to a single variant.
2813	if (Pred != ICmpInst::Predicate::ICMP_EQ) {
2814	Pred = ICmpInst::getInversePredicate(pred: Pred);
2815	std::swap(a&: TrueBB, b&: FalseBB);
2816	}
2817
2818	// We expect to exit loop when comparison yields false,
2819	// so when it yields true we should branch back to loop header.
2820	if (TrueBB != LoopHeaderBB) {
2821	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
2822	return false;
2823	}
2824
2825	// Okay, idiom checks out.
2826	return true;
2827	}
2828
2829	/// Look for the following loop:
2830	/// \code
2831	/// entry:
2832	/// <...>
2833	/// %bitmask = shl i32 1, %bitpos
2834	/// br label %loop
2835	///
2836	/// loop:
2837	/// %x.curr = phi i32 [ %x, %entry ], [ %x.next, %loop ]
2838	/// %x.curr.bitmasked = and i32 %x.curr, %bitmask
2839	/// %x.curr.isbitunset = icmp eq i32 %x.curr.bitmasked, 0
2840	/// %x.next = shl i32 %x.curr, 1
2841	/// <...>
2842	/// br i1 %x.curr.isbitunset, label %loop, label %end
2843	///
2844	/// end:
2845	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2846	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2847	/// <...>
2848	/// \endcode
2849	///
2850	/// And transform it into:
2851	/// \code
2852	/// entry:
2853	/// %bitmask = shl i32 1, %bitpos
2854	/// %lowbitmask = add i32 %bitmask, -1
2855	/// %mask = or i32 %lowbitmask, %bitmask
2856	/// %x.masked = and i32 %x, %mask
2857	/// %x.masked.numleadingzeros = call i32 @llvm.ctlz.i32(i32 %x.masked,
2858	/// i1 true)
2859	/// %x.masked.numactivebits = sub i32 32, %x.masked.numleadingzeros
2860	/// %x.masked.leadingonepos = add i32 %x.masked.numactivebits, -1
2861	/// %backedgetakencount = sub i32 %bitpos, %x.masked.leadingonepos
2862	/// %tripcount = add i32 %backedgetakencount, 1
2863	/// %x.curr = shl i32 %x, %backedgetakencount
2864	/// %x.next = shl i32 %x, %tripcount
2865	/// br label %loop
2866	///
2867	/// loop:
2868	/// %loop.iv = phi i32 [ 0, %entry ], [ %loop.iv.next, %loop ]
2869	/// %loop.iv.next = add nuw i32 %loop.iv, 1
2870	/// %loop.ivcheck = icmp eq i32 %loop.iv.next, %tripcount
2871	/// <...>
2872	/// br i1 %loop.ivcheck, label %end, label %loop
2873	///
2874	/// end:
2875	/// %x.curr.res = phi i32 [ %x.curr, %loop ] <...>
2876	/// %x.next.res = phi i32 [ %x.next, %loop ] <...>
2877	/// <...>
2878	/// \endcode
2879	bool LoopIdiomRecognize::recognizeShiftUntilBitTest() {
2880	bool MadeChange = false;
2881
2882	Value X, BitMask, BitPos, XCurr;
2883	Instruction *XNext;
2884	if (!detectShiftUntilBitTestIdiom(CurLoop, BaseX&: X, BitMask, BitPos, CurrX&: XCurr,
2885	NextX&: XNext)) {
2886	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2887	" shift-until-bittest idiom detection failed.\n");
2888	return MadeChange;
2889	}
2890	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom detected!\n");
2891
2892	// Ok, it is the idiom we were looking for, we could* transform this loop,*
2893	// but is it profitable to transform?
2894
2895	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
2896	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
2897	assert(LoopPreheaderBB && "There is always a loop preheader.");
2898
2899	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
2900	assert(SuccessorBB && "There is only a single successor.");
2901
2902	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
2903	Builder.SetCurrentDebugLocation(cast<Instruction>(Val: XCurr)->getDebugLoc());
2904
2905	Intrinsic::ID IntrID = Intrinsic::ctlz;
2906	Type *Ty = X->getType();
2907	unsigned Bitwidth = Ty->getScalarSizeInBits();
2908
2909	TargetTransformInfo::TargetCostKind CostKind =
2910	TargetTransformInfo::TCK_SizeAndLatency;
2911
2912	// The rewrite is considered to be unprofitable iff and only iff the
2913	// intrinsic/shift we'll use are not cheap. Note that we are okay with just
2914	// making the loop countable, even if nothing else changes.
2915	IntrinsicCostAttributes Attrs(
2916	IntrID, Ty, {PoisonValue::get(T: Ty), /is_zero_poison=/Builder.getTrue()});
2917	InstructionCost Cost = TTI->getIntrinsicInstrCost(ICA: Attrs, CostKind);
2918	if (Cost > TargetTransformInfo::TCC_Basic) {
2919	LLVM_DEBUG(dbgs() << DEBUG_TYPE
2920	" Intrinsic is too costly, not beneficial\n");
2921	return MadeChange;
2922	}
2923	if (TTI->getArithmeticInstrCost(Opcode: Instruction::Shl, Ty, CostKind) >
2924	TargetTransformInfo::TCC_Basic) {
2925	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Shift is too costly, not beneficial\n");
2926	return MadeChange;
2927	}
2928
2929	// Ok, transform appears worthwhile.
2930	MadeChange = true;
2931
2932	if (!isGuaranteedNotToBeUndefOrPoison(V: BitPos)) {
2933	// BitMask may be computed from BitPos, Freeze BitPos so we can increase
2934	// it's use count.
2935	std::optional<BasicBlock::iterator> InsertPt = std::nullopt;
2936	if (auto *BitPosI = dyn_cast<Instruction>(Val: BitPos))
2937	InsertPt = BitPosI->getInsertionPointAfterDef();
2938	else
2939	InsertPt = DT->getRoot()->getFirstNonPHIOrDbgOrAlloca();
2940	if (!InsertPt)
2941	return false;
2942	FreezeInst *BitPosFrozen =
2943	new FreezeInst (BitPos, BitPos->getName() + ".fr", *InsertPt);
2944	BitPos->replaceUsesWithIf(New: BitPosFrozen, ShouldReplace: [BitPosFrozen](Use &U) {
2945	return U.getUser() != BitPosFrozen;
2946	});
2947	BitPos = BitPosFrozen;
2948	}
2949
2950	// Step 1: Compute the loop trip count.
2951
2952	Value *LowBitMask = Builder.CreateAdd(LHS: BitMask, RHS: Constant::getAllOnesValue(Ty),
2953	Name: BitPos->getName() + ".lowbitmask");
2954	Value *Mask =
2955	Builder.CreateOr(LHS: LowBitMask, RHS: BitMask, Name: BitPos->getName() + ".mask");
2956	Value *XMasked = Builder.CreateAnd(LHS: X, RHS: Mask, Name: X->getName() + ".masked");
2957	CallInst *XMaskedNumLeadingZeros = Builder.CreateIntrinsic(
2958	ID: IntrID, Types: Ty, Args: {XMasked, /is_zero_poison=/Builder.getTrue()},
2959	/FMFSource=/nullptr, Name: XMasked->getName() + ".numleadingzeros");
2960	Value *XMaskedNumActiveBits = Builder.CreateSub(
2961	LHS: ConstantInt::get(Ty, V: Ty->getScalarSizeInBits()), RHS: XMaskedNumLeadingZeros,
2962	Name: XMasked->getName() + ".numactivebits", /HasNUW=/true,
2963	/HasNSW=/Bitwidth != `2`);
2964	Value *XMaskedLeadingOnePos =
2965	Builder.CreateAdd(LHS: XMaskedNumActiveBits, RHS: Constant::getAllOnesValue(Ty),
2966	Name: XMasked->getName() + ".leadingonepos", /HasNUW=/false,
2967	/HasNSW=/Bitwidth > `2`);
2968
2969	Value *LoopBackedgeTakenCount = Builder.CreateSub(
2970	LHS: BitPos, RHS: XMaskedLeadingOnePos, Name: CurLoop->getName() + ".backedgetakencount",
2971	/HasNUW=/true, /HasNSW=/true);
2972	// We know loop's backedge-taken count, but what's loop's trip count?
2973	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
2974	Value *LoopTripCount =
2975	Builder.CreateAdd(LHS: LoopBackedgeTakenCount, RHS: ConstantInt::get(Ty, V: `1`),
2976	Name: CurLoop->getName() + ".tripcount", /HasNUW=/true,
2977	/HasNSW=/Bitwidth != `2`);
2978
2979	// Step 2: Compute the recurrence's final value without a loop.
2980
2981	// NewX is always safe to compute, because `LoopBackedgeTakenCount`
2982	// will always be smaller than `bitwidth(X)`, i.e. we never get poison.
2983	Value *NewX = Builder.CreateShl(LHS: X, RHS: LoopBackedgeTakenCount);
2984	NewX->takeName(V: XCurr);
2985	if (auto *I = dyn_cast<Instruction>(Val: NewX))
2986	I->copyIRFlags(V: XNext, /IncludeWrapFlags=/true);
2987
2988	Value *NewXNext;
2989	// Rewriting XNext is more complicated, however, because `X << LoopTripCount`
2990	// will be poison iff `LoopTripCount == bitwidth(X)` (which will happen
2991	// iff `BitPos` is `bitwidth(x) - 1` and `X` is `1`). So unless we know
2992	// that isn't the case, we'll need to emit an alternative, safe IR.
2993	if (XNext->hasNoSignedWrap() \|\| XNext->hasNoUnsignedWrap() \|\|
2994	PatternMatch::match(
2995	V: BitPos, P: PatternMatch::m_SpecificInt_ICMP(
2996	Predicate: ICmpInst::ICMP_NE, Threshold: APInt (Ty->getScalarSizeInBits(),
2997	Ty->getScalarSizeInBits() - `1`))))
2998	NewXNext = Builder.CreateShl(LHS: X, RHS: LoopTripCount);
2999	else {
3000	// Otherwise, just additionally shift by one. It's the smallest solution,
3001	// alternatively, we could check that NewX is INT_MIN (or BitPos is )
3002	// and select 0 instead.
3003	NewXNext = Builder.CreateShl(LHS: NewX, RHS: ConstantInt::get(Ty, V: `1`));
3004	}
3005
3006	NewXNext->takeName(V: XNext);
3007	if (auto *I = dyn_cast<Instruction>(Val: NewXNext))
3008	I->copyIRFlags(V: XNext, /IncludeWrapFlags=/true);
3009
3010	// Step 3: Adjust the successor basic block to recieve the computed
3011	// recurrence's final value instead of the recurrence itself.
3012
3013	XCurr->replaceUsesOutsideBlock(V: NewX, BB: LoopHeaderBB);
3014	XNext->replaceUsesOutsideBlock(V: NewXNext, BB: LoopHeaderBB);
3015
3016	// Step 4: Rewrite the loop into a countable form, with canonical IV.
3017
3018	// The new canonical induction variable.
3019	Builder.SetInsertPoint(TheBB: LoopHeaderBB, IP: LoopHeaderBB->begin());
3020	auto *IV = Builder.CreatePHI(Ty, NumReservedValues: `2`, Name: CurLoop->getName() + ".iv");
3021
3022	// The induction itself.
3023	// Note that while NUW is always safe, while NSW is only for bitwidths != 2.
3024	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3025	auto *IVNext =
3026	Builder.CreateAdd(LHS: IV, RHS: ConstantInt::get(Ty, V: `1`), Name: IV->getName() + ".next",
3027	/HasNUW=/true, /HasNSW=/Bitwidth != `2`);
3028
3029	// The loop trip count check.
3030	auto *IVCheck = Builder.CreateICmpEQ(LHS: IVNext, RHS: LoopTripCount,
3031	Name: CurLoop->getName() + ".ivcheck");
3032	Builder.CreateCondBr(Cond: IVCheck, True: SuccessorBB, False: LoopHeaderBB);
3033	LoopHeaderBB->getTerminator()->eraseFromParent();
3034
3035	// Populate the IV PHI.
3036	IV->addIncoming(V: ConstantInt::get(Ty, V: `0`), BB: LoopPreheaderBB);
3037	IV->addIncoming(V: IVNext, BB: LoopHeaderBB);
3038
3039	// Step 5: Forget the "non-computable" trip-count SCEV associated with the
3040	// loop. The loop would otherwise not be deleted even if it becomes empty.
3041
3042	SE->forgetLoop(L: CurLoop);
3043
3044	// Other passes will take care of actually deleting the loop if possible.
3045
3046	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-bittest idiom optimized!\n");
3047
3048	++NumShiftUntilBitTest;
3049	return MadeChange;
3050	}
3051
3052	/// Return true if the idiom is detected in the loop.
3053	///
3054	/// The core idiom we are trying to detect is:
3055	/// \code
3056	/// entry:
3057	/// <...>
3058	/// %start = <...>
3059	/// %extraoffset = <...>
3060	/// <...>
3061	/// br label %for.cond
3062	///
3063	/// loop:
3064	/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
3065	/// %nbits = add nsw i8 %iv, %extraoffset
3066	/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3067	/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3068	/// %iv.next = add i8 %iv, 1
3069	/// <...>
3070	/// br i1 %val.shifted.iszero, label %end, label %loop
3071	///
3072	/// end:
3073	/// %iv.res = phi i8 [ %iv, %loop ] <...>
3074	/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3075	/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3076	/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3077	/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3078	/// <...>
3079	/// \endcode
3080	static bool detectShiftUntilZeroIdiom(Loop CurLoop, ScalarEvolution SE,
3081	Instruction *&ValShiftedIsZero,
3082	Intrinsic::ID &IntrinID, Instruction *&IV,
3083	Value &Start, Value &Val,
3084	const SCEV *&ExtraOffsetExpr,
3085	bool &InvertedCond) {
3086	LLVM_DEBUG(dbgs() << DEBUG_TYPE
3087	" Performing shift-until-zero idiom detection.\n");
3088
3089	// Give up if the loop has multiple blocks or multiple backedges.
3090	if (CurLoop->getNumBlocks() != `1` \|\| CurLoop->getNumBackEdges() != `1`) {
3091	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad block/backedge count.\n");
3092	return false;
3093	}
3094
3095	Instruction ValShifted, NBits, *IVNext;
3096	Value *ExtraOffset;
3097
3098	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3099	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3100	assert(LoopPreheaderBB && "There is always a loop preheader.");
3101
3102	using namespace PatternMatch;
3103
3104	// Step 1: Check if the loop backedge, condition is in desirable form.
3105
3106	CmpPredicate Pred;
3107	BasicBlock TrueBB, FalseBB;
3108	if (!match(V: LoopHeaderBB->getTerminator(),
3109	P: m_Br(C: m_Instruction(I&: ValShiftedIsZero), T: m_BasicBlock(V&: TrueBB),
3110	F: m_BasicBlock(V&: FalseBB))) \|\|
3111	!match(V: ValShiftedIsZero,
3112	P: m_ICmp(Pred, L: m_Instruction(I&: ValShifted), R: m_Zero())) \|\|
3113	!ICmpInst::isEquality(P: Pred)) {
3114	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge structure.\n");
3115	return false;
3116	}
3117
3118	// Step 2: Check if the comparison's operand is in desirable form.
3119	// FIXME: Val could be a one-input PHI node, which we should look past.
3120	if (!match(V: ValShifted, P: m_Shift(L: m_LoopInvariant(M: m_Value(V&: Val), L: CurLoop),
3121	R: m_Instruction(I&: NBits)))) {
3122	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad comparisons value computation.\n");
3123	return false;
3124	}
3125	IntrinID = ValShifted->getOpcode() == Instruction::Shl ? Intrinsic::cttz
3126	: Intrinsic::ctlz;
3127
3128	// Step 3: Check if the shift amount is in desirable form.
3129
3130	if (match(V: NBits, P: m_c_Add(L: m_Instruction(I&: IV),
3131	R: m_LoopInvariant(M: m_Value(V&: ExtraOffset), L: CurLoop))) &&
3132	(NBits->hasNoSignedWrap() \|\| NBits->hasNoUnsignedWrap()))
3133	ExtraOffsetExpr = SE->getNegativeSCEV(V: SE->getSCEV(V: ExtraOffset));
3134	else if (match(V: NBits,
3135	P: m_Sub(L: m_Instruction(I&: IV),
3136	R: m_LoopInvariant(M: m_Value(V&: ExtraOffset), L: CurLoop))) &&
3137	NBits->hasNoSignedWrap())
3138	ExtraOffsetExpr = SE->getSCEV(V: ExtraOffset);
3139	else {
3140	IV = NBits;
3141	ExtraOffsetExpr = SE->getZero(Ty: NBits->getType());
3142	}
3143
3144	// Step 4: Check if the recurrence is in desirable form.
3145	auto *IVPN = dyn_cast<PHINode>(Val: IV);
3146	if (!IVPN \|\| IVPN->getParent() != LoopHeaderBB) {
3147	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Not an expected PHI node.\n");
3148	return false;
3149	}
3150
3151	Start = IVPN->getIncomingValueForBlock(BB: LoopPreheaderBB);
3152	IVNext = dyn_cast<Instruction>(Val: IVPN->getIncomingValueForBlock(BB: LoopHeaderBB));
3153
3154	if (!IVNext \|\| !match(V: IVNext, P: m_Add(L: m_Specific(V: IVPN), R: m_One()))) {
3155	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad recurrence.\n");
3156	return false;
3157	}
3158
3159	// Step 4: Check if the backedge's destinations are in desirable form.
3160
3161	assert(ICmpInst::isEquality(Pred) &&
3162	"Should only get equality predicates here.");
3163
3164	// cmp-br is commutative, so canonicalize to a single variant.
3165	InvertedCond = Pred != ICmpInst::Predicate::ICMP_EQ;
3166	if (InvertedCond) {
3167	Pred = ICmpInst::getInversePredicate(pred: Pred);
3168	std::swap(a&: TrueBB, b&: FalseBB);
3169	}
3170
3171	// We expect to exit loop when comparison yields true,
3172	// so when it yields false we should branch back to loop header.
3173	if (FalseBB != LoopHeaderBB) {
3174	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Bad backedge flow.\n");
3175	return false;
3176	}
3177
3178	// The new, countable, loop will certainly only run a known number of
3179	// iterations, It won't be infinite. But the old loop might be infinite
3180	// under certain conditions. For logical shifts, the value will become zero
3181	// after at most bitwidth(%Val) loop iterations. However, for arithmetic
3182	// right-shift, iff the sign bit was set, the value will never become zero,
3183	// and the loop may never finish.
3184	if (ValShifted->getOpcode() == Instruction::AShr &&
3185	!isMustProgress(L: CurLoop) && !SE->isKnownNonNegative(S: SE->getSCEV(V: Val))) {
3186	LLVM_DEBUG(dbgs() << DEBUG_TYPE " Can not prove the loop is finite.\n");
3187	return false;
3188	}
3189
3190	// Okay, idiom checks out.
3191	return true;
3192	}
3193
3194	/// Look for the following loop:
3195	/// \code
3196	/// entry:
3197	/// <...>
3198	/// %start = <...>
3199	/// %extraoffset = <...>
3200	/// <...>
3201	/// br label %for.cond
3202	///
3203	/// loop:
3204	/// %iv = phi i8 [ %start, %entry ], [ %iv.next, %for.cond ]
3205	/// %nbits = add nsw i8 %iv, %extraoffset
3206	/// %val.shifted = {{l,a}shr,shl} i8 %val, %nbits
3207	/// %val.shifted.iszero = icmp eq i8 %val.shifted, 0
3208	/// %iv.next = add i8 %iv, 1
3209	/// <...>
3210	/// br i1 %val.shifted.iszero, label %end, label %loop
3211	///
3212	/// end:
3213	/// %iv.res = phi i8 [ %iv, %loop ] <...>
3214	/// %nbits.res = phi i8 [ %nbits, %loop ] <...>
3215	/// %val.shifted.res = phi i8 [ %val.shifted, %loop ] <...>
3216	/// %val.shifted.iszero.res = phi i1 [ %val.shifted.iszero, %loop ] <...>
3217	/// %iv.next.res = phi i8 [ %iv.next, %loop ] <...>
3218	/// <...>
3219	/// \endcode
3220	///
3221	/// And transform it into:
3222	/// \code
3223	/// entry:
3224	/// <...>
3225	/// %start = <...>
3226	/// %extraoffset = <...>
3227	/// <...>
3228	/// %val.numleadingzeros = call i8 @llvm.ct{l,t}z.i8(i8 %val, i1 0)
3229	/// %val.numactivebits = sub i8 8, %val.numleadingzeros
3230	/// %extraoffset.neg = sub i8 0, %extraoffset
3231	/// %tmp = add i8 %val.numactivebits, %extraoffset.neg
3232	/// %iv.final = call i8 @llvm.smax.i8(i8 %tmp, i8 %start)
3233	/// %loop.tripcount = sub i8 %iv.final, %start
3234	/// br label %loop
3235	///
3236	/// loop:
3237	/// %loop.iv = phi i8 [ 0, %entry ], [ %loop.iv.next, %loop ]
3238	/// %loop.iv.next = add i8 %loop.iv, 1
3239	/// %loop.ivcheck = icmp eq i8 %loop.iv.next, %loop.tripcount
3240	/// %iv = add i8 %loop.iv, %start
3241	/// <...>
3242	/// br i1 %loop.ivcheck, label %end, label %loop
3243	///
3244	/// end:
3245	/// %iv.res = phi i8 [ %iv.final, %loop ] <...>
3246	/// <...>
3247	/// \endcode
3248	bool LoopIdiomRecognize::recognizeShiftUntilZero() {
3249	bool MadeChange = false;
3250
3251	Instruction *ValShiftedIsZero;
3252	Intrinsic::ID IntrID;
3253	Instruction *IV;
3254	Value Start, Val;
3255	const SCEV *ExtraOffsetExpr;
3256	bool InvertedCond;
3257	if (!detectShiftUntilZeroIdiom(CurLoop, SE, ValShiftedIsZero, IntrinID&: IntrID, IV,
3258	Start, Val, ExtraOffsetExpr, InvertedCond)) {
3259	LLVM_DEBUG(dbgs() << DEBUG_TYPE
3260	" shift-until-zero idiom detection failed.\n");
3261	return MadeChange;
3262	}
3263	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom detected!\n");
3264
3265	// Ok, it is the idiom we were looking for, we could* transform this loop,*
3266	// but is it profitable to transform?
3267
3268	BasicBlock *LoopHeaderBB = CurLoop->getHeader();
3269	BasicBlock *LoopPreheaderBB = CurLoop->getLoopPreheader();
3270	assert(LoopPreheaderBB && "There is always a loop preheader.");
3271
3272	BasicBlock *SuccessorBB = CurLoop->getExitBlock();
3273	assert(SuccessorBB && "There is only a single successor.");
3274
3275	IRBuilder<> Builder(LoopPreheaderBB->getTerminator());
3276	Builder.SetCurrentDebugLocation(IV->getDebugLoc());
3277
3278	Type *Ty = Val->getType();
3279	unsigned Bitwidth = Ty->getScalarSizeInBits();
3280
3281	TargetTransformInfo::TargetCostKind CostKind =
3282	TargetTransformInfo::TCK_SizeAndLatency;
3283
3284	// The rewrite is considered to be unprofitable iff and only iff the
3285	// intrinsic we'll use are not cheap. Note that we are okay with just
3286	// making the loop countable, even if nothing else changes.
3287	IntrinsicCostAttributes Attrs(
3288	IntrID, Ty, {PoisonValue::get(T: Ty), /is_zero_poison=/Builder.getFalse()});
3289	InstructionCost Cost = TTI->getIntrinsicInstrCost(ICA: Attrs, CostKind);
3290	if (Cost > TargetTransformInfo::TCC_Basic) {
3291	LLVM_DEBUG(dbgs() << DEBUG_TYPE
3292	" Intrinsic is too costly, not beneficial\n");
3293	return MadeChange;
3294	}
3295
3296	// Ok, transform appears worthwhile.
3297	MadeChange = true;
3298
3299	bool OffsetIsZero = ExtraOffsetExpr->isZero();
3300
3301	// Step 1: Compute the loop's final IV value / trip count.
3302
3303	CallInst *ValNumLeadingZeros = Builder.CreateIntrinsic(
3304	ID: IntrID, Types: Ty, Args: {Val, /is_zero_poison=/Builder.getFalse()},
3305	/FMFSource=/nullptr, Name: Val->getName() + ".numleadingzeros");
3306	Value *ValNumActiveBits = Builder.CreateSub(
3307	LHS: ConstantInt::get(Ty, V: Ty->getScalarSizeInBits()), RHS: ValNumLeadingZeros,
3308	Name: Val->getName() + ".numactivebits", /HasNUW=/true,
3309	/HasNSW=/Bitwidth != `2`);
3310
3311	SCEVExpander Expander(SE, DL, "loop-idiom");
3312	Expander.setInsertPoint(&*Builder.GetInsertPoint());
3313	Value *ExtraOffset = Expander.expandCodeFor(SH: ExtraOffsetExpr);
3314
3315	Value *ValNumActiveBitsOffset = Builder.CreateAdd(
3316	LHS: ValNumActiveBits, RHS: ExtraOffset, Name: ValNumActiveBits->getName() + ".offset",
3317	/HasNUW=/OffsetIsZero, /HasNSW=/true);
3318	Value *IVFinal = Builder.CreateIntrinsic(ID: Intrinsic::smax, Types: {Ty},
3319	Args: {ValNumActiveBitsOffset, Start},
3320	/FMFSource=/nullptr, Name: "iv.final");
3321
3322	auto *LoopBackedgeTakenCount = cast<Instruction>(Val: Builder.CreateSub(
3323	LHS: IVFinal, RHS: Start, Name: CurLoop->getName() + ".backedgetakencount",
3324	/HasNUW=/OffsetIsZero, /HasNSW=/true));
3325	// FIXME: or when the offset was `add nuw`
3326
3327	// We know loop's backedge-taken count, but what's loop's trip count?
3328	Value *LoopTripCount =
3329	Builder.CreateAdd(LHS: LoopBackedgeTakenCount, RHS: ConstantInt::get(Ty, V: `1`),
3330	Name: CurLoop->getName() + ".tripcount", /HasNUW=/true,
3331	/HasNSW=/Bitwidth != `2`);
3332
3333	// Step 2: Adjust the successor basic block to recieve the original
3334	// induction variable's final value instead of the orig. IV itself.
3335
3336	IV->replaceUsesOutsideBlock(V: IVFinal, BB: LoopHeaderBB);
3337
3338	// Step 3: Rewrite the loop into a countable form, with canonical IV.
3339
3340	// The new canonical induction variable.
3341	Builder.SetInsertPoint(TheBB: LoopHeaderBB, IP: LoopHeaderBB->begin());
3342	auto *CIV = Builder.CreatePHI(Ty, NumReservedValues: `2`, Name: CurLoop->getName() + ".iv");
3343
3344	// The induction itself.
3345	Builder.SetInsertPoint(TheBB: LoopHeaderBB, IP: LoopHeaderBB->getFirstNonPHIIt());
3346	auto *CIVNext =
3347	Builder.CreateAdd(LHS: CIV, RHS: ConstantInt::get(Ty, V: `1`), Name: CIV->getName() + ".next",
3348	/HasNUW=/true, /HasNSW=/Bitwidth != `2`);
3349
3350	// The loop trip count check.
3351	auto *CIVCheck = Builder.CreateICmpEQ(LHS: CIVNext, RHS: LoopTripCount,
3352	Name: CurLoop->getName() + ".ivcheck");
3353	auto *NewIVCheck = CIVCheck;
3354	if (InvertedCond) {
3355	NewIVCheck = Builder.CreateNot(V: CIVCheck);
3356	NewIVCheck->takeName(V: ValShiftedIsZero);
3357	}
3358
3359	// The original IV, but rebased to be an offset to the CIV.
3360	auto IVDePHId = Builder.CreateAdd(LHS: CIV, RHS: Start, Name: "", /HasNUW=/*false,
3361	/HasNSW=/true); // FIXME: what about NUW?
3362	IVDePHId->takeName(V: IV);
3363
3364	// The loop terminator.
3365	Builder.SetInsertPoint(LoopHeaderBB->getTerminator());
3366	Builder.CreateCondBr(Cond: CIVCheck, True: SuccessorBB, False: LoopHeaderBB);
3367	LoopHeaderBB->getTerminator()->eraseFromParent();
3368
3369	// Populate the IV PHI.
3370	CIV->addIncoming(V: ConstantInt::get(Ty, V: `0`), BB: LoopPreheaderBB);
3371	CIV->addIncoming(V: CIVNext, BB: LoopHeaderBB);
3372
3373	// Step 4: Forget the "non-computable" trip-count SCEV associated with the
3374	// loop. The loop would otherwise not be deleted even if it becomes empty.
3375
3376	SE->forgetLoop(L: CurLoop);
3377
3378	// Step 5: Try to cleanup the loop's body somewhat.
3379	IV->replaceAllUsesWith(V: IVDePHId);
3380	IV->eraseFromParent();
3381
3382	ValShiftedIsZero->replaceAllUsesWith(V: NewIVCheck);
3383	ValShiftedIsZero->eraseFromParent();
3384
3385	// Other passes will take care of actually deleting the loop if possible.
3386
3387	LLVM_DEBUG(dbgs() << DEBUG_TYPE " shift-until-zero idiom optimized!\n");
3388
3389	++NumShiftUntilZero;
3390	return MadeChange;
3391	}
3392

source code of llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp