MachinePipeliner.cpp source code [llvm/lib/CodeGen/MachinePipeliner.cpp]

1	//===- MachinePipeliner.cpp - Machine Software Pipeliner Pass -------------===//
2	//
3	// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	// See https://llvm.org/LICENSE.txt for license information.
5	// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	//
7	//===----------------------------------------------------------------------===//
8	//
9	// An implementation of the Swing Modulo Scheduling (SMS) software pipeliner.
10	//
11	// This SMS implementation is a target-independent back-end pass. When enabled,
12	// the pass runs just prior to the register allocation pass, while the machine
13	// IR is in SSA form. If software pipelining is successful, then the original
14	// loop is replaced by the optimized loop. The optimized loop contains one or
15	// more prolog blocks, the pipelined kernel, and one or more epilog blocks. If
16	// the instructions cannot be scheduled in a given MII, we increase the MII by
17	// one and try again.
18	//
19	// The SMS implementation is an extension of the ScheduleDAGInstrs class. We
20	// represent loop carried dependences in the DAG as order edges to the Phi
21	// nodes. We also perform several passes over the DAG to eliminate unnecessary
22	// edges that inhibit the ability to pipeline. The implementation uses the
23	// DFAPacketizer class to compute the minimum initiation interval and the check
24	// where an instruction may be inserted in the pipelined schedule.
25	//
26	// In order for the SMS pass to work, several target specific hooks need to be
27	// implemented to get information about the loop structure and to rewrite
28	// instructions.
29	//
30	//===----------------------------------------------------------------------===//
31
32	#include "llvm/CodeGen/MachinePipeliner.h"
33	#include "llvm/ADT/ArrayRef.h"
34	#include "llvm/ADT/BitVector.h"
35	#include "llvm/ADT/DenseMap.h"
36	#include "llvm/ADT/PriorityQueue.h"
37	#include "llvm/ADT/STLExtras.h"
38	#include "llvm/ADT/SetOperations.h"
39	#include "llvm/ADT/SetVector.h"
40	#include "llvm/ADT/SmallPtrSet.h"
41	#include "llvm/ADT/SmallSet.h"
42	#include "llvm/ADT/SmallVector.h"
43	#include "llvm/ADT/Statistic.h"
44	#include "llvm/ADT/iterator_range.h"
45	#include "llvm/Analysis/AliasAnalysis.h"
46	#include "llvm/Analysis/MemoryLocation.h"
47	#include "llvm/Analysis/OptimizationRemarkEmitter.h"
48	#include "llvm/Analysis/ValueTracking.h"
49	#include "llvm/CodeGen/DFAPacketizer.h"
50	#include "llvm/CodeGen/LiveIntervals.h"
51	#include "llvm/CodeGen/MachineBasicBlock.h"
52	#include "llvm/CodeGen/MachineDominators.h"
53	#include "llvm/CodeGen/MachineFunction.h"
54	#include "llvm/CodeGen/MachineFunctionPass.h"
55	#include "llvm/CodeGen/MachineInstr.h"
56	#include "llvm/CodeGen/MachineInstrBuilder.h"
57	#include "llvm/CodeGen/MachineLoopInfo.h"
58	#include "llvm/CodeGen/MachineMemOperand.h"
59	#include "llvm/CodeGen/MachineOperand.h"
60	#include "llvm/CodeGen/MachineRegisterInfo.h"
61	#include "llvm/CodeGen/ModuloSchedule.h"
62	#include "llvm/CodeGen/Register.h"
63	#include "llvm/CodeGen/RegisterClassInfo.h"
64	#include "llvm/CodeGen/RegisterPressure.h"
65	#include "llvm/CodeGen/ScheduleDAG.h"
66	#include "llvm/CodeGen/ScheduleDAGMutation.h"
67	#include "llvm/CodeGen/TargetInstrInfo.h"
68	#include "llvm/CodeGen/TargetOpcodes.h"
69	#include "llvm/CodeGen/TargetPassConfig.h"
70	#include "llvm/CodeGen/TargetRegisterInfo.h"
71	#include "llvm/CodeGen/TargetSubtargetInfo.h"
72	#include "llvm/Config/llvm-config.h"
73	#include "llvm/IR/Attributes.h"
74	#include "llvm/IR/Function.h"
75	#include "llvm/MC/LaneBitmask.h"
76	#include "llvm/MC/MCInstrDesc.h"
77	#include "llvm/MC/MCInstrItineraries.h"
78	#include "llvm/Pass.h"
79	#include "llvm/Support/CommandLine.h"
80	#include "llvm/Support/Compiler.h"
81	#include "llvm/Support/Debug.h"
82	#include "llvm/Support/raw_ostream.h"
83	#include <algorithm>
84	#include <cassert>
85	#include <climits>
86	#include <cstdint>
87	#include <deque>
88	#include <functional>
89	#include <iomanip>
90	#include <iterator>
91	#include <map>
92	#include <memory>
93	#include <sstream>
94	#include <tuple>
95	#include <utility>
96	#include <vector>
97
98	using namespace llvm;
99
100	#define DEBUG_TYPE "pipeliner"
101
102	STATISTIC(NumTrytoPipeline, "Number of loops that we attempt to pipeline");
103	STATISTIC(NumPipelined, "Number of loops software pipelined");
104	STATISTIC(NumNodeOrderIssues, "Number of node order issues found");
105	STATISTIC(NumFailBranch, "Pipeliner abort due to unknown branch");
106	STATISTIC(NumFailLoop, "Pipeliner abort due to unsupported loop");
107	STATISTIC(NumFailPreheader, "Pipeliner abort due to missing preheader");
108	STATISTIC(NumFailLargeMaxMII, "Pipeliner abort due to MaxMII too large");
109	STATISTIC(NumFailZeroMII, "Pipeliner abort due to zero MII");
110	STATISTIC(NumFailNoSchedule, "Pipeliner abort due to no schedule found");
111	STATISTIC(NumFailZeroStage, "Pipeliner abort due to zero stage");
112	STATISTIC(NumFailLargeMaxStage, "Pipeliner abort due to too many stages");
113	STATISTIC(NumFailTooManyStores, "Pipeliner abort due to too many stores");
114
115	/// A command line option to turn software pipelining on or off.
116	static cl::opt<bool> EnableSWP("enable-pipeliner", cl::Hidden, cl::init(Val: true),
117	cl::desc ("Enable Software Pipelining"));
118
119	/// A command line option to enable SWP at -Os.
120	static cl::opt<bool> EnableSWPOptSize("enable-pipeliner-opt-size",
121	cl::desc ("Enable SWP at Os."), cl::Hidden,
122	cl::init(Val: false));
123
124	/// A command line argument to limit minimum initial interval for pipelining.
125	static cl::opt<int> SwpMaxMii("pipeliner-max-mii",
126	cl::desc ("Size limit for the MII."),
127	cl::Hidden, cl::init(Val: `27`));
128
129	/// A command line argument to force pipeliner to use specified initial
130	/// interval.
131	static cl::opt<int> SwpForceII("pipeliner-force-ii",
132	cl::desc ("Force pipeliner to use specified II."),
133	cl::Hidden, cl::init(Val: -`1`));
134
135	/// A command line argument to limit the number of stages in the pipeline.
136	static cl::opt<int>
137	SwpMaxStages("pipeliner-max-stages",
138	cl::desc ("Maximum stages allowed in the generated scheduled."),
139	cl::Hidden, cl::init(Val: `3`));
140
141	/// A command line option to disable the pruning of chain dependences due to
142	/// an unrelated Phi.
143	static cl::opt<bool>
144	SwpPruneDeps("pipeliner-prune-deps",
145	cl::desc ("Prune dependences between unrelated Phi nodes."),
146	cl::Hidden, cl::init(Val: true));
147
148	/// A command line option to disable the pruning of loop carried order
149	/// dependences.
150	static cl::opt<bool>
151	SwpPruneLoopCarried("pipeliner-prune-loop-carried",
152	cl::desc ("Prune loop carried order dependences."),
153	cl::Hidden, cl::init(Val: true));
154
155	#ifndef NDEBUG
156	static cl::opt<int> SwpLoopLimit("pipeliner-max", cl::Hidden, cl::init(-`1`));
157	#endif
158
159	static cl::opt<bool> SwpIgnoreRecMII("pipeliner-ignore-recmii",
160	cl::ReallyHidden,
161	cl::desc ("Ignore RecMII"));
162
163	static cl::opt<bool> SwpShowResMask("pipeliner-show-mask", cl::Hidden,
164	cl::init(Val: false));
165	static cl::opt<bool> SwpDebugResource("pipeliner-dbg-res", cl::Hidden,
166	cl::init(Val: false));
167
168	static cl::opt<bool> EmitTestAnnotations(
169	"pipeliner-annotate-for-testing", cl::Hidden, cl::init(Val: false),
170	cl::desc ("Instead of emitting the pipelined code, annotate instructions "
171	"with the generated schedule for feeding into the "
172	"-modulo-schedule-test pass"));
173
174	static cl::opt<bool> ExperimentalCodeGen(
175	"pipeliner-experimental-cg", cl::Hidden, cl::init(Val: false),
176	cl::desc (
177	"Use the experimental peeling code generator for software pipelining"));
178
179	static cl::opt<int> SwpIISearchRange("pipeliner-ii-search-range",
180	cl::desc ("Range to search for II"),
181	cl::Hidden, cl::init(Val: `10`));
182
183	static cl::opt<bool>
184	LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(Val: false),
185	cl::desc ("Limit register pressure of scheduled loop"));
186
187	static cl::opt<int>
188	RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden,
189	cl::init(Val: `5`),
190	cl::desc ("Margin representing the unused percentage of "
191	"the register pressure limit"));
192
193	static cl::opt<bool>
194	MVECodeGen("pipeliner-mve-cg", cl::Hidden, cl::init(Val: false),
195	cl::desc ("Use the MVE code generator for software pipelining"));
196
197	/// A command line argument to limit the number of store instructions in the
198	/// target basic block.
199	static cl::opt<unsigned> SwpMaxNumStores(
200	"pipeliner-max-num-stores",
201	cl::desc ("Maximum number of stores allwed in the target loop."), cl::Hidden,
202	cl::init(Val: `200`));
203
204	namespace llvm {
205
206	// A command line option to enable the CopyToPhi DAG mutation.
207	cl::opt<bool> SwpEnableCopyToPhi("pipeliner-enable-copytophi", cl::ReallyHidden,
208	cl::init(Val: true),
209	cl::desc ("Enable CopyToPhi DAG Mutation"));
210
211	/// A command line argument to force pipeliner to use specified issue
212	/// width.
213	cl::opt<int> SwpForceIssueWidth(
214	"pipeliner-force-issue-width",
215	cl::desc ("Force pipeliner to use specified issue width."), cl::Hidden,
216	cl::init(Val: -`1`));
217
218	/// A command line argument to set the window scheduling option.
219	static cl::opt<WindowSchedulingFlag> WindowSchedulingOption(
220	"window-sched", cl::Hidden, cl::init(Val: WindowSchedulingFlag::WS_On),
221	cl::desc ("Set how to use window scheduling algorithm."),
222	cl::values(clEnumValN(WindowSchedulingFlag::WS_Off, "off",
223	"Turn off window algorithm."),
224	clEnumValN(WindowSchedulingFlag::WS_On, "on",
225	"Use window algorithm after SMS algorithm fails."),
226	clEnumValN(WindowSchedulingFlag::WS_Force, "force",
227	"Use window algorithm instead of SMS algorithm.")));
228
229	} // end namespace llvm
230
231	unsigned SwingSchedulerDAG::Circuits::MaxPaths = `5`;
232	char MachinePipeliner::ID = `0`;
233	#ifndef NDEBUG
234	int MachinePipeliner::NumTries = `0`;
235	#endif
236	char &llvm::MachinePipelinerID = MachinePipeliner::ID;
237
238	INITIALIZE_PASS_BEGIN(MachinePipeliner, DEBUG_TYPE,
239	"Modulo Software Pipelining", false, false)
240	INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
241	INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass)
242	INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass)
243	INITIALIZE_PASS_DEPENDENCY(LiveIntervalsWrapperPass)
244	INITIALIZE_PASS_END(MachinePipeliner, DEBUG_TYPE,
245	"Modulo Software Pipelining", false, false)
246
247	namespace {
248
249	/// This class holds an SUnit corresponding to a memory operation and other
250	/// information related to the instruction.
251	struct SUnitWithMemInfo {
252	SUnit *SU;
253	SmallVector<const Value *, `2`> UnderlyingObjs;
254
255	/// The value of a memory operand.
256	const Value MemOpValue = nullptr*;
257
258	/// The offset of a memory operand.
259	int64_t MemOpOffset = `0`;
260
261	AAMDNodes AATags;
262
263	/// True if all the underlying objects are identified.
264	bool IsAllIdentified = false;
265
266	SUnitWithMemInfo(SUnit *SU);
267
268	bool isTriviallyDisjoint(const SUnitWithMemInfo &Other) const;
269
270	bool isUnknown() const { return MemOpValue == nullptr; }
271
272	private:
273	bool getUnderlyingObjects();
274	};
275
276	/// Add loop-carried chain dependencies. This class handles the same type of
277	/// dependencies added by `ScheduleDAGInstrs::buildSchedGraph`, but takes into
278	/// account dependencies across iterations.
279	class LoopCarriedOrderDepsTracker {
280	// Type of instruction that is relevant to order-dependencies
281	enum class InstrTag {
282	Barrier = `0`, ///< A barrier event instruction.
283	LoadOrStore = `1`, ///< An instruction that may load or store memory, but is
284	///< not a barrier event.
285	FPExceptions = `2`, ///< An instruction that does not match above, but may
286	///< raise floatin-point exceptions.
287	};
288
289	struct TaggedSUnit : PointerIntPair<SUnit *, `2`> {
290	TaggedSUnit(SUnit *SU, InstrTag Tag)
291	: PointerIntPair<SUnit , `2`>(SU, unsigned*(Tag)) {}
292
293	InstrTag getTag() const { return InstrTag(getInt()); }
294	};
295
296	/// Holds loads and stores with memory related information.
297	struct LoadStoreChunk {
298	SmallVector<SUnitWithMemInfo, `4`> Loads;
299	SmallVector<SUnitWithMemInfo, `4`> Stores;
300
301	void append(SUnit *SU);
302	};
303
304	SwingSchedulerDAG *DAG;
305	BatchAAResults *BAA;
306	std::vector<SUnit> &SUnits;
307
308	/// The size of SUnits, for convenience.
309	const unsigned N;
310
311	/// Loop-carried Edges.
312	std::vector<BitVector> LoopCarried;
313
314	/// Instructions related to chain dependencies. They are one of the
315	/// following:
316	///
317	/// 1. Barrier event.
318	/// 2. Load, but neither a barrier event, invariant load, nor may load trap
319	/// value.
320	/// 3. Store, but not a barrier event.
321	/// 4. None of them, but may raise floating-point exceptions.
322	///
323	/// This is used when analyzing loop-carried dependencies that access global
324	/// barrier instructions.
325	std::vector<TaggedSUnit> TaggedSUnits;
326
327	const TargetInstrInfo TII = nullptr*;
328	const TargetRegisterInfo TRI = nullptr*;
329
330	public:
331	LoopCarriedOrderDepsTracker(SwingSchedulerDAG SSD, BatchAAResults BAA,
332	const TargetInstrInfo *TII,
333	const TargetRegisterInfo *TRI);
334
335	/// The main function to compute loop-carried order-dependencies.
336	void computeDependencies();
337
338	const BitVector &getLoopCarried(unsigned Idx) const {
339	return LoopCarried [Idx];
340	}
341
342	private:
343	/// Tags to \p SU if the instruction may affect the order-dependencies.
344	std::optional<InstrTag> getInstrTag(SUnit SU) const*;
345
346	void addLoopCarriedDepenenciesForChunks(const LoadStoreChunk &From,
347	const LoadStoreChunk &To);
348
349	/// Add a loop-carried order dependency between \p Src and \p Dst if we
350	/// cannot prove they are independent. When \p PerformCheapCheck is true, a
351	/// lightweight dependency test (referred to as "cheap check" below) is
352	/// performed at first. Note that the cheap check is retained to maintain the
353	/// existing behavior and not expected to be used anymore.
354	///
355	/// TODO: Remove \p PerformCheapCheck and the corresponding cheap check.
356	void addDependenciesBetweenSUs(const SUnitWithMemInfo &Src,
357	const SUnitWithMemInfo &Dst,
358	bool PerformCheapCheck = false);
359
360	void computeDependenciesAux();
361	};
362
363	} // end anonymous namespace
364
365	/// The "main" function for implementing Swing Modulo Scheduling.
366	bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
367	if (skipFunction(F: mf.getFunction()))
368	return false;
369
370	if (!EnableSWP)
371	return false;
372
373	if (mf.getFunction().getAttributes().hasFnAttr(Kind: Attribute::OptimizeForSize) &&
374	!EnableSWPOptSize.getPosition())
375	return false;
376
377	if (!mf.getSubtarget().enableMachinePipeliner())
378	return false;
379
380	// Cannot pipeline loops without instruction itineraries if we are using
381	// DFA for the pipeliner.
382	if (mf.getSubtarget().useDFAforSMS() &&
383	(!mf.getSubtarget().getInstrItineraryData() \|\|
384	mf.getSubtarget().getInstrItineraryData()->isEmpty()))
385	return false;
386
387	MF = &mf;
388	MLI = &getAnalysis<MachineLoopInfoWrapperPass>().getLI();
389	MDT = &getAnalysis<MachineDominatorTreeWrapperPass>().getDomTree();
390	ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
391	TII = MF->getSubtarget().getInstrInfo();
392	RegClassInfo.runOnMachineFunction(MF: *MF);
393
394	for (const auto &L : *MLI)
395	scheduleLoop(L&: *L);
396
397	return false;
398	}
399
400	/// Attempt to perform the SMS algorithm on the specified loop. This function is
401	/// the main entry point for the algorithm. The function identifies candidate
402	/// loops, calculates the minimum initiation interval, and attempts to schedule
403	/// the loop.
404	bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
405	bool Changed = false;
406	for (const auto &InnerLoop : L)
407	Changed \|= scheduleLoop(L&: *InnerLoop);
408
409	#ifndef NDEBUG
410	// Stop trying after reaching the limit (if any).
411	int Limit = SwpLoopLimit;
412	if (Limit >= `0`) {
413	if (NumTries >= SwpLoopLimit)
414	return Changed;
415	NumTries++;
416	}
417	#endif
418
419	setPragmaPipelineOptions(L);
420	if (!canPipelineLoop(L)) {
421	LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
422	ORE->emit(RemarkBuilder: [&]() {
423	return MachineOptimizationRemarkMissed (DEBUG_TYPE, "canPipelineLoop",
424	L.getStartLoc(), L.getHeader())
425	<< "Failed to pipeline loop";
426	});
427
428	LI.LoopPipelinerInfo.reset();
429	return Changed;
430	}
431
432	++NumTrytoPipeline;
433	if (useSwingModuloScheduler())
434	Changed = swingModuloScheduler(L);
435
436	if (useWindowScheduler(Changed))
437	Changed = runWindowScheduler(L);
438
439	LI.LoopPipelinerInfo.reset();
440	return Changed;
441	}
442
443	void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
444	// Reset the pragma for the next loop in iteration.
445	disabledByPragma = false;
446	II_setByPragma = `0`;
447
448	MachineBasicBlock *LBLK = L.getTopBlock();
449
450	if (LBLK == nullptr)
451	return;
452
453	const BasicBlock *BBLK = LBLK->getBasicBlock();
454	if (BBLK == nullptr)
455	return;
456
457	const Instruction *TI = BBLK->getTerminator();
458	if (TI == nullptr)
459	return;
460
461	MDNode *LoopID = TI->getMetadata(KindID: LLVMContext::MD_loop);
462	if (LoopID == nullptr)
463	return;
464
465	assert(LoopID->getNumOperands() > `0` && "requires atleast one operand");
466	assert(LoopID->getOperand(`0`) == LoopID && "invalid loop");
467
468	for (const MDOperand &MDO : llvm::drop_begin(RangeOrContainer: LoopID->operands())) {
469	MDNode *MD = dyn_cast<MDNode>(Val: MDO);
470
471	if (MD == nullptr)
472	continue;
473
474	MDString *S = dyn_cast<MDString>(Val: MD->getOperand(I: `0`));
475
476	if (S == nullptr)
477	continue;
478
479	if (S->getString() == "llvm.loop.pipeline.initiationinterval") {
480	assert(MD->getNumOperands() == `2` &&
481	"Pipeline initiation interval hint metadata should have two operands.");
482	II_setByPragma =
483	mdconst::extract<ConstantInt>(MD: MD->getOperand(I: `1`))->getZExtValue();
484	assert(II_setByPragma >= `1` && "Pipeline initiation interval must be positive.");
485	} else if (S->getString() == "llvm.loop.pipeline.disable") {
486	disabledByPragma = true;
487	}
488	}
489	}
490
491	/// Return true if the loop can be software pipelined. The algorithm is
492	/// restricted to loops with a single basic block. Make sure that the
493	/// branch in the loop can be analyzed.
494	bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
495	if (L.getNumBlocks() != `1`) {
496	ORE->emit(RemarkBuilder: [&]() {
497	return MachineOptimizationRemarkAnalysis (DEBUG_TYPE, "canPipelineLoop",
498	L.getStartLoc(), L.getHeader())
499	<< "Not a single basic block: "
500	<< ore::NV ("NumBlocks", L.getNumBlocks());
501	});
502	return false;
503	}
504
505	if (disabledByPragma) {
506	ORE->emit(RemarkBuilder: [&]() {
507	return MachineOptimizationRemarkAnalysis (DEBUG_TYPE, "canPipelineLoop",
508	L.getStartLoc(), L.getHeader())
509	<< "Disabled by Pragma.";
510	});
511	return false;
512	}
513
514	// Check if the branch can't be understood because we can't do pipelining
515	// if that's the case.
516	LI.TBB = nullptr;
517	LI.FBB = nullptr;
518	LI.BrCond.clear();
519	if (TII->analyzeBranch(MBB&: *L.getHeader(), TBB&: LI.TBB, FBB&: LI.FBB, Cond&: LI.BrCond)) {
520	LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
521	NumFailBranch ++;
522	ORE->emit(RemarkBuilder: [&]() {
523	return MachineOptimizationRemarkAnalysis (DEBUG_TYPE, "canPipelineLoop",
524	L.getStartLoc(), L.getHeader())
525	<< "The branch can't be understood";
526	});
527	return false;
528	}
529
530	LI.LoopInductionVar = nullptr;
531	LI.LoopCompare = nullptr;
532	LI.LoopPipelinerInfo = TII->analyzeLoopForPipelining(LoopBB: L.getTopBlock());
533	if (!LI.LoopPipelinerInfo) {
534	LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
535	NumFailLoop ++;
536	ORE->emit(RemarkBuilder: [&]() {
537	return MachineOptimizationRemarkAnalysis (DEBUG_TYPE, "canPipelineLoop",
538	L.getStartLoc(), L.getHeader())
539	<< "The loop structure is not supported";
540	});
541	return false;
542	}
543
544	if (!L.getLoopPreheader()) {
545	LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
546	NumFailPreheader ++;
547	ORE->emit(RemarkBuilder: [&]() {
548	return MachineOptimizationRemarkAnalysis (DEBUG_TYPE, "canPipelineLoop",
549	L.getStartLoc(), L.getHeader())
550	<< "No loop preheader found";
551	});
552	return false;
553	}
554
555	unsigned NumStores = `0`;
556	for (MachineInstr &MI : *L.getHeader())
557	if (MI.mayStore())
558	++NumStores;
559	if (NumStores > SwpMaxNumStores) {
560	LLVM_DEBUG(dbgs() << "Too many stores\n");
561	NumFailTooManyStores ++;
562	ORE->emit(RemarkBuilder: [&]() {
563	return MachineOptimizationRemarkAnalysis (DEBUG_TYPE, "canPipelineLoop",
564	L.getStartLoc(), L.getHeader())
565	<< "Too many store instructions in the loop: "
566	<< ore::NV ("NumStores", NumStores) << " > "
567	<< ore::NV ("SwpMaxNumStores", SwpMaxNumStores) << ".";
568	});
569	return false;
570	}
571
572	// Remove any subregisters from inputs to phi nodes.
573	preprocessPhiNodes(B&: *L.getHeader());
574	return true;
575	}
576
577	void MachinePipeliner::preprocessPhiNodes(MachineBasicBlock &B) {
578	MachineRegisterInfo &MRI = MF->getRegInfo();
579	SlotIndexes &Slots =
580	*getAnalysis<LiveIntervalsWrapperPass>().getLIS().getSlotIndexes();
581
582	for (MachineInstr &PI : B.phis()) {
583	MachineOperand &DefOp = PI.getOperand(i: `0`);
584	assert(DefOp.getSubReg() == `0`);
585	auto *RC = MRI.getRegClass(Reg: DefOp.getReg());
586
587	for (unsigned i = `1`, n = PI.getNumOperands(); i != n; i += `2`) {
588	MachineOperand &RegOp = PI.getOperand(i);
589	if (RegOp.getSubReg() == `0`)
590	continue;
591
592	// If the operand uses a subregister, replace it with a new register
593	// without subregisters, and generate a copy to the new register.
594	Register NewReg = MRI.createVirtualRegister(RegClass: RC);
595	MachineBasicBlock &PredB = *PI.getOperand(i: i+`1`).getMBB();
596	MachineBasicBlock::iterator At = PredB.getFirstTerminator();
597	const DebugLoc &DL = PredB.findDebugLoc(MBBI: At);
598	auto Copy = BuildMI(BB&: PredB, I: At, MIMD: DL, MCID: TII->get(Opcode: TargetOpcode::COPY), DestReg: NewReg)
599	.addReg(RegNo: RegOp.getReg(), flags: getRegState(RegOp),
600	SubReg: RegOp.getSubReg());
601	Slots.insertMachineInstrInMaps(MI&: *Copy);
602	RegOp.setReg(NewReg);
603	RegOp.setSubReg(`0`);
604	}
605	}
606	}
607
608	/// The SMS algorithm consists of the following main steps:
609	/// 1. Computation and analysis of the dependence graph.
610	/// 2. Ordering of the nodes (instructions).
611	/// 3. Attempt to Schedule the loop.
612	bool MachinePipeliner::swingModuloScheduler(MachineLoop &L) {
613	assert(L.getBlocks().size() == `1` && "SMS works on single blocks only.");
614
615	AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
616	SwingSchedulerDAG SMS(
617	*this, L, getAnalysis<LiveIntervalsWrapperPass>().getLIS(), RegClassInfo,
618	II_setByPragma, LI.LoopPipelinerInfo.get(), AA);
619
620	MachineBasicBlock *MBB = L.getHeader();
621	// The kernel should not include any terminator instructions. These
622	// will be added back later.
623	SMS.startBlock(BB: MBB);
624
625	// Compute the number of 'real' instructions in the basic block by
626	// ignoring terminators.
627	unsigned size = MBB->size();
628	for (MachineBasicBlock::iterator I = MBB->getFirstTerminator(),
629	E = MBB->instr_end();
630	I != E; ++I, --size)
631	;
632
633	SMS.enterRegion(bb: MBB, begin: MBB->begin(), end: MBB->getFirstTerminator(), regioninstrs: size);
634	SMS.schedule();
635	SMS.exitRegion();
636
637	SMS.finishBlock();
638	return SMS.hasNewSchedule();
639	}
640
641	void MachinePipeliner::getAnalysisUsage(AnalysisUsage &AU) const {
642	AU.addRequired<AAResultsWrapperPass>();
643	AU.addPreserved<AAResultsWrapperPass>();
644	AU.addRequired<MachineLoopInfoWrapperPass>();
645	AU.addRequired<MachineDominatorTreeWrapperPass>();
646	AU.addRequired<LiveIntervalsWrapperPass>();
647	AU.addRequired<MachineOptimizationRemarkEmitterPass>();
648	AU.addRequired<TargetPassConfig>();
649	MachineFunctionPass::getAnalysisUsage(AU);
650	}
651
652	bool MachinePipeliner::runWindowScheduler(MachineLoop &L) {
653	MachineSchedContext Context;
654	Context.MF = MF;
655	Context.MLI = MLI;
656	Context.MDT = MDT;
657	Context.TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
658	Context.AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
659	Context.LIS = &getAnalysis<LiveIntervalsWrapperPass>().getLIS();
660	Context.RegClassInfo->runOnMachineFunction(MF: *MF);
661	WindowScheduler WS(&Context, L);
662	return WS.run();
663	}
664
665	bool MachinePipeliner::useSwingModuloScheduler() {
666	// SwingModuloScheduler does not work when WindowScheduler is forced.
667	return WindowSchedulingOption != WindowSchedulingFlag::WS_Force;
668	}
669
670	bool MachinePipeliner::useWindowScheduler(bool Changed) {
671	// WindowScheduler does not work for following cases:
672	// 1. when it is off.
673	// 2. when SwingModuloScheduler is successfully scheduled.
674	// 3. when pragma II is enabled.
675	if (II_setByPragma) {
676	LLVM_DEBUG(dbgs() << "Window scheduling is disabled when "
677	"llvm.loop.pipeline.initiationinterval is set.\n");
678	return false;
679	}
680
681	return WindowSchedulingOption == WindowSchedulingFlag::WS_Force \|\|
682	(WindowSchedulingOption == WindowSchedulingFlag::WS_On && !Changed);
683	}
684
685	void SwingSchedulerDAG::setMII(unsigned ResMII, unsigned RecMII) {
686	if (SwpForceII > `0`)
687	MII = SwpForceII;
688	else if (II_setByPragma > `0`)
689	MII = II_setByPragma;
690	else
691	MII = std::max(a: ResMII, b: RecMII);
692	}
693
694	void SwingSchedulerDAG::setMAX_II() {
695	if (SwpForceII > `0`)
696	MAX_II = SwpForceII;
697	else if (II_setByPragma > `0`)
698	MAX_II = II_setByPragma;
699	else
700	MAX_II = MII + SwpIISearchRange;
701	}
702
703	/// We override the schedule function in ScheduleDAGInstrs to implement the
704	/// scheduling part of the Swing Modulo Scheduling algorithm.
705	void SwingSchedulerDAG::schedule() {
706	buildSchedGraph(AA);
707	const LoopCarriedEdges LCE = addLoopCarriedDependences();
708	updatePhiDependences();
709	Topo.InitDAGTopologicalSorting();
710	changeDependences();
711	postProcessDAG();
712	DDG = std::make_unique<SwingSchedulerDDG>(args&: SUnits, args: &EntrySU, args: &ExitSU, args: LCE);
713	LLVM_DEBUG({
714	dump();
715	dbgs() << "===== Loop Carried Edges Begin =====\n";
716	for (SUnit &SU : SUnits)
717	LCE.dump(&SU, TRI, &MRI);
718	dbgs() << "===== Loop Carried Edges End =====\n";
719	});
720
721	NodeSetType NodeSets;
722	findCircuits(NodeSets);
723	NodeSetType Circuits = NodeSets;
724
725	// Calculate the MII.
726	unsigned ResMII = calculateResMII();
727	unsigned RecMII = calculateRecMII(RecNodeSets&: NodeSets);
728
729	fuseRecs(NodeSets);
730
731	// This flag is used for testing and can cause correctness problems.
732	if (SwpIgnoreRecMII)
733	RecMII = `0`;
734
735	setMII(ResMII, RecMII);
736	setMAX_II();
737
738	LLVM_DEBUG(dbgs() << "MII = " << MII << " MAX_II = " << MAX_II
739	<< " (rec=" << RecMII << ", res=" << ResMII << ")\n");
740
741	// Can't schedule a loop without a valid MII.
742	if (MII == `0`) {
743	LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
744	NumFailZeroMII ++;
745	Pass.ORE->emit(RemarkBuilder: [&]() {
746	return MachineOptimizationRemarkAnalysis (
747	DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
748	<< "Invalid Minimal Initiation Interval: 0";
749	});
750	return;
751	}
752
753	// Don't pipeline large loops.
754	if (SwpMaxMii != -`1` && (int)MII > SwpMaxMii) {
755	LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
756	<< ", we don't pipeline large loops\n");
757	NumFailLargeMaxMII ++;
758	Pass.ORE->emit(RemarkBuilder: [&]() {
759	return MachineOptimizationRemarkAnalysis (
760	DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
761	<< "Minimal Initiation Interval too large: "
762	<< ore::NV ("MII", (int)MII) << " > "
763	<< ore::NV ("SwpMaxMii", SwpMaxMii) << "."
764	<< "Refer to -pipeliner-max-mii.";
765	});
766	return;
767	}
768
769	computeNodeFunctions(NodeSets);
770
771	registerPressureFilter(NodeSets);
772
773	colocateNodeSets(NodeSets);
774
775	checkNodeSets(NodeSets);
776
777	LLVM_DEBUG({
778	for (auto &I : NodeSets) {
779	dbgs() << " Rec NodeSet ";
780	I.dump();
781	}
782	});
783
784	llvm::stable_sort(Range&: NodeSets, C: std::greater<NodeSet>());
785
786	groupRemainingNodes(NodeSets);
787
788	removeDuplicateNodes(NodeSets);
789
790	LLVM_DEBUG({
791	for (auto &I : NodeSets) {
792	dbgs() << " NodeSet ";
793	I.dump();
794	}
795	});
796
797	computeNodeOrder(NodeSets);
798
799	// check for node order issues
800	checkValidNodeOrder(Circuits);
801
802	SMSchedule Schedule(Pass.MF, this);
803	Scheduled = schedulePipeline(Schedule);
804
805	if (!Scheduled){
806	LLVM_DEBUG(dbgs() << "No schedule found, return\n");
807	NumFailNoSchedule ++;
808	Pass.ORE->emit(RemarkBuilder: [&]() {
809	return MachineOptimizationRemarkAnalysis (
810	DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
811	<< "Unable to find schedule";
812	});
813	return;
814	}
815
816	unsigned numStages = Schedule.getMaxStageCount();
817	// No need to generate pipeline if there are no overlapped iterations.
818	if (numStages == `0`) {
819	LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
820	NumFailZeroStage ++;
821	Pass.ORE->emit(RemarkBuilder: [&]() {
822	return MachineOptimizationRemarkAnalysis (
823	DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
824	<< "No need to pipeline - no overlapped iterations in schedule.";
825	});
826	return;
827	}
828	// Check that the maximum stage count is less than user-defined limit.
829	if (SwpMaxStages > -`1` && (int)numStages > SwpMaxStages) {
830	LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
831	<< " : too many stages, abort\n");
832	NumFailLargeMaxStage ++;
833	Pass.ORE->emit(RemarkBuilder: [&]() {
834	return MachineOptimizationRemarkAnalysis (
835	DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
836	<< "Too many stages in schedule: "
837	<< ore::NV ("numStages", (int)numStages) << " > "
838	<< ore::NV ("SwpMaxStages", SwpMaxStages)
839	<< ". Refer to -pipeliner-max-stages.";
840	});
841	return;
842	}
843
844	Pass.ORE->emit(RemarkBuilder: [&]() {
845	return MachineOptimizationRemark (DEBUG_TYPE, "schedule", Loop.getStartLoc(),
846	Loop.getHeader())
847	<< "Pipelined succesfully!";
848	});
849
850	// Generate the schedule as a ModuloSchedule.
851	DenseMap<MachineInstr , int*> Cycles, Stages;
852	std::vector<MachineInstr *> OrderedInsts;
853	for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
854	++Cycle) {
855	for (SUnit *SU : Schedule.getInstructions(cycle: Cycle)) {
856	OrderedInsts.push_back(x: SU->getInstr());
857	Cycles [SU->getInstr()] = Cycle;
858	Stages [SU->getInstr()] = Schedule.stageScheduled(SU);
859	}
860	}
861	DenseMap<MachineInstr *, std::pair<Register, int64_t>> NewInstrChanges;
862	for (auto &KV : NewMIs) {
863	Cycles [KV.first] = Cycles [KV.second];
864	Stages [KV.first] = Stages [KV.second];
865	NewInstrChanges [KV.first] = InstrChanges [getSUnit(MI: KV.first)];
866	}
867
868	ModuloSchedule MS(MF, &Loop, std::move(OrderedInsts), std::move(Cycles),
869	std::move(Stages));
870	if (EmitTestAnnotations) {
871	assert(NewInstrChanges.empty() &&
872	"Cannot serialize a schedule with InstrChanges!");
873	ModuloScheduleTestAnnotater MSTI(MF, MS);
874	MSTI.annotate();
875	return;
876	}
877	// The experimental code generator can't work if there are InstChanges.
878	if (ExperimentalCodeGen && NewInstrChanges.empty()) {
879	PeelingModuloScheduleExpander MSE(MF, MS, &LIS);
880	MSE.expand();
881	} else if (MVECodeGen && NewInstrChanges.empty() &&
882	LoopPipelinerInfo->isMVEExpanderSupported() &&
883	ModuloScheduleExpanderMVE::canApply(L&: Loop)) {
884	ModuloScheduleExpanderMVE MSE(MF, MS, LIS);
885	MSE.expand();
886	} else {
887	ModuloScheduleExpander MSE(MF, MS, LIS, std::move(NewInstrChanges));
888	MSE.expand();
889	MSE.cleanup();
890	}
891	++NumPipelined;
892	}
893
894	/// Clean up after the software pipeliner runs.
895	void SwingSchedulerDAG::finishBlock() {
896	for (auto &KV : NewMIs)
897	MF.deleteMachineInstr(MI: KV.second);
898	NewMIs.clear();
899
900	// Call the superclass.
901	ScheduleDAGInstrs::finishBlock();
902	}
903
904	/// Return the register values for the operands of a Phi instruction.
905	/// This function assume the instruction is a Phi.
906	static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
907	Register &InitVal, Register &LoopVal) {
908	assert(Phi.isPHI() && "Expecting a Phi.");
909
910	InitVal = Register ();
911	LoopVal = Register ();
912	for (unsigned i = `1`, e = Phi.getNumOperands(); i != e; i += `2`)
913	if (Phi.getOperand(i: i + `1`).getMBB() != Loop)
914	InitVal = Phi.getOperand(i).getReg();
915	else
916	LoopVal = Phi.getOperand(i).getReg();
917
918	assert(InitVal && LoopVal && "Unexpected Phi structure.");
919	}
920
921	/// Return the Phi register value that comes the loop block.
922	static Register getLoopPhiReg(const MachineInstr &Phi,
923	const MachineBasicBlock *LoopBB) {
924	for (unsigned i = `1`, e = Phi.getNumOperands(); i != e; i += `2`)
925	if (Phi.getOperand(i: i + `1`).getMBB() == LoopBB)
926	return Phi.getOperand(i).getReg();
927	return Register ();
928	}
929
930	/// Return true if SUb can be reached from SUa following the chain edges.
931	static bool isSuccOrder(SUnit SUa, SUnit SUb) {
932	SmallPtrSet<SUnit *, `8`> Visited;
933	SmallVector<SUnit *, `8`> Worklist;
934	Worklist.push_back(Elt: SUa);
935	while (!Worklist.empty()) {
936	const SUnit *SU = Worklist.pop_back_val();
937	for (const auto &SI : SU->Succs) {
938	SUnit *SuccSU = SI.getSUnit();
939	if (SI.getKind() == SDep::Order) {
940	if (Visited.count(Ptr: SuccSU))
941	continue;
942	if (SuccSU == SUb)
943	return true;
944	Worklist.push_back(Elt: SuccSU);
945	Visited.insert(Ptr: SuccSU);
946	}
947	}
948	}
949	return false;
950	}
951
952	SUnitWithMemInfo::SUnitWithMemInfo(SUnit *SU) : SU(SU) {
953	if (!getUnderlyingObjects())
954	return;
955	for (const Value *Obj : UnderlyingObjs)
956	if (!isIdentifiedObject(V: Obj)) {
957	IsAllIdentified = false;
958	break;
959	}
960	}
961
962	bool SUnitWithMemInfo::isTriviallyDisjoint(
963	const SUnitWithMemInfo &Other) const {
964	// If all underlying objects are identified objects and there is no overlap
965	// between them, then these two instructions are disjoint.
966	if (!IsAllIdentified \|\| !Other.IsAllIdentified)
967	return false;
968	for (const Value *Obj : UnderlyingObjs)
969	if (llvm::is_contained(Range: Other.UnderlyingObjs, Element: Obj))
970	return false;
971	return true;
972	}
973
974	/// Collect the underlying objects for the memory references of an instruction.
975	/// This function calls the code in ValueTracking, but first checks that the
976	/// instruction has a memory operand.
977	/// Returns false if we cannot find the underlying objects.
978	bool SUnitWithMemInfo::getUnderlyingObjects() {
979	const MachineInstr *MI = SU->getInstr();
980	if (!MI->hasOneMemOperand())
981	return false;
982	MachineMemOperand MM = MI->memoperands_begin();
983	if (!MM->getValue())
984	return false;
985	MemOpValue = MM->getValue();
986	MemOpOffset = MM->getOffset();
987	llvm::getUnderlyingObjects(V: MemOpValue, Objects&: UnderlyingObjs);
988
989	// TODO: A no alias scope may be valid only in a single iteration. In this
990	// case we need to peel off it like LoopAccessAnalysis does.
991	AATags = MM->getAAInfo();
992	return true;
993	}
994
995	/// Returns true if there is a loop-carried order dependency from \p Src to \p
996	/// Dst.
997	static bool
998	hasLoopCarriedMemDep(const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
999	BatchAAResults &BAA, const TargetInstrInfo *TII,
1000	const TargetRegisterInfo *TRI,
1001	const SwingSchedulerDAG SSD, bool* PerformCheapCheck) {
1002	if (Src.isTriviallyDisjoint(Other: Dst))
1003	return false;
1004	if (isSuccOrder(SUa: Src.SU, SUb: Dst.SU))
1005	return false;
1006
1007	MachineInstr &SrcMI = *Src.SU->getInstr();
1008	MachineInstr &DstMI = *Dst.SU->getInstr();
1009	if (PerformCheapCheck) {
1010	// First, perform the cheaper check that compares the base register.
1011	// If they are the same and the load offset is less than the store
1012	// offset, then mark the dependence as loop carried potentially.
1013	//
1014	// TODO: This check will be removed.
1015	const MachineOperand BaseOp1, BaseOp2;
1016	int64_t Offset1, Offset2;
1017	bool Offset1IsScalable, Offset2IsScalable;
1018	if (TII->getMemOperandWithOffset(MI: SrcMI, BaseOp&: BaseOp1, Offset&: Offset1, OffsetIsScalable&: Offset1IsScalable,
1019	TRI) &&
1020	TII->getMemOperandWithOffset(MI: DstMI, BaseOp&: BaseOp2, Offset&: Offset2, OffsetIsScalable&: Offset2IsScalable,
1021	TRI)) {
1022	if (BaseOp1->isIdenticalTo(Other: *BaseOp2) &&
1023	Offset1IsScalable == Offset2IsScalable &&
1024	(int)Offset1 < (int)Offset2) {
1025	assert(TII->areMemAccessesTriviallyDisjoint(SrcMI, DstMI) &&
1026	"What happened to the chain edge?");
1027	return true;
1028	}
1029	}
1030	}
1031
1032	if (!SSD->mayOverlapInLaterIter(BaseMI: &SrcMI, OtherMI: &DstMI))
1033	return false;
1034
1035	// Second, the more expensive check that uses alias analysis on the
1036	// base registers. If they alias, and the load offset is less than
1037	// the store offset, the mark the dependence as loop carried.
1038	if (Src.isUnknown() \|\| Dst.isUnknown())
1039	return true;
1040	if (Src.MemOpValue == Dst.MemOpValue && Src.MemOpOffset <= Dst.MemOpOffset)
1041	return true;
1042
1043	if (BAA.isNoAlias(
1044	LocA: MemoryLocation::getBeforeOrAfter(Ptr: Src.MemOpValue, AATags: Src.AATags),
1045	LocB: MemoryLocation::getBeforeOrAfter(Ptr: Dst.MemOpValue, AATags: Dst.AATags)))
1046	return false;
1047
1048	// AliasAnalysis sometimes gives up on following the underlying
1049	// object. In such a case, separate checks for underlying objects may
1050	// prove that there are no aliases between two accesses.
1051	for (const Value *SrcObj : Src.UnderlyingObjs)
1052	for (const Value *DstObj : Dst.UnderlyingObjs)
1053	if (!BAA.isNoAlias(LocA: MemoryLocation::getBeforeOrAfter(Ptr: SrcObj, AATags: Src.AATags),
1054	LocB: MemoryLocation::getBeforeOrAfter(Ptr: DstObj, AATags: Dst.AATags)))
1055	return true;
1056
1057	return false;
1058	}
1059
1060	void LoopCarriedOrderDepsTracker::LoadStoreChunk::append(SUnit *SU) {
1061	const MachineInstr *MI = SU->getInstr();
1062	if (!MI->mayLoadOrStore())
1063	return;
1064	(MI->mayStore() ? Stores : Loads).emplace_back(Args&: SU);
1065	}
1066
1067	LoopCarriedOrderDepsTracker::LoopCarriedOrderDepsTracker(
1068	SwingSchedulerDAG SSD, BatchAAResults BAA, const TargetInstrInfo *TII,
1069	const TargetRegisterInfo *TRI)
1070	: DAG(SSD), BAA(BAA), SUnits(DAG->SUnits), N(SUnits.size()),
1071	LoopCarried (N, BitVector (N)), TII(TII), TRI(TRI) {}
1072
1073	void LoopCarriedOrderDepsTracker::computeDependencies() {
1074	// Traverse all instructions and extract only what we are targetting.
1075	for (auto &SU : SUnits) {
1076	auto Tagged = getInstrTag(SU: &SU);
1077
1078	// This instruction has no loop-carried order-dependencies.
1079	if (!Tagged)
1080	continue;
1081	TaggedSUnits.emplace_back(args: &SU, args&: *Tagged);
1082	}
1083
1084	computeDependenciesAux();
1085	}
1086
1087	std::optional<LoopCarriedOrderDepsTracker::InstrTag>
1088	LoopCarriedOrderDepsTracker::getInstrTag(SUnit SU) const* {
1089	MachineInstr *MI = SU->getInstr();
1090	if (TII->isGlobalMemoryObject(MI))
1091	return InstrTag::Barrier;
1092
1093	if (MI->mayStore() \|\|
1094	(MI->mayLoad() && !MI->isDereferenceableInvariantLoad()))
1095	return InstrTag::LoadOrStore;
1096
1097	if (MI->mayRaiseFPException())
1098	return InstrTag::FPExceptions;
1099
1100	return std::nullopt;
1101	}
1102
1103	void LoopCarriedOrderDepsTracker::addDependenciesBetweenSUs(
1104	const SUnitWithMemInfo &Src, const SUnitWithMemInfo &Dst,
1105	bool PerformCheapCheck) {
1106	// Avoid self-dependencies.
1107	if (Src.SU == Dst.SU)
1108	return;
1109
1110	if (hasLoopCarriedMemDep(Src, Dst, BAA&: *BAA, TII, TRI, SSD: DAG, PerformCheapCheck))
1111	LoopCarried [Src.SU->NodeNum].set(Dst.SU->NodeNum);
1112	}
1113
1114	void LoopCarriedOrderDepsTracker::addLoopCarriedDepenenciesForChunks(
1115	const LoadStoreChunk &From, const LoadStoreChunk &To) {
1116	// Add load-to-store dependencies (WAR).
1117	for (const SUnitWithMemInfo &Src : From.Loads)
1118	for (const SUnitWithMemInfo &Dst : To.Stores)
1119	// Perform a cheap check first if this is a forward dependency.
1120	addDependenciesBetweenSUs(Src, Dst, PerformCheapCheck: Src.SU->NodeNum < Dst.SU->NodeNum);
1121
1122	// Add store-to-load dependencies (RAW).
1123	for (const SUnitWithMemInfo &Src : From.Stores)
1124	for (const SUnitWithMemInfo &Dst : To.Loads)
1125	addDependenciesBetweenSUs(Src, Dst);
1126
1127	// Add store-to-store dependencies (WAW).
1128	for (const SUnitWithMemInfo &Src : From.Stores)
1129	for (const SUnitWithMemInfo &Dst : To.Stores)
1130	addDependenciesBetweenSUs(Src, Dst);
1131	}
1132
1133	void LoopCarriedOrderDepsTracker::computeDependenciesAux() {
1134	SmallVector<LoadStoreChunk, `2`> Chunks(`1`);
1135	for (const auto &TSU : TaggedSUnits) {
1136	InstrTag Tag = TSU.getTag();
1137	SUnit *SU = TSU.getPointer();
1138	switch (Tag) {
1139	case InstrTag::Barrier:
1140	Chunks.emplace_back();
1141	break;
1142	case InstrTag::LoadOrStore:
1143	Chunks.back().append(SU);
1144	break;
1145	case InstrTag::FPExceptions:
1146	// TODO: Handle this properly.
1147	break;
1148	}
1149	}
1150
1151	// Add dependencies between memory operations. If there are one or more
1152	// barrier events between two memory instructions, we don't add a
1153	// loop-carried dependence for them.
1154	for (const LoadStoreChunk &Chunk : Chunks)
1155	addLoopCarriedDepenenciesForChunks(From: Chunk, To: Chunk);
1156
1157	// TODO: If there are multiple barrier instructions, dependencies from the
1158	// last barrier instruction (or load/store below it) to the first barrier
1159	// instruction (or load/store above it).
1160	}
1161
1162	/// Add a chain edge between a load and store if the store can be an
1163	/// alias of the load on a subsequent iteration, i.e., a loop carried
1164	/// dependence. This code is very similar to the code in ScheduleDAGInstrs
1165	/// but that code doesn't create loop carried dependences.
1166	/// TODO: Also compute output-dependencies.
1167	LoopCarriedEdges SwingSchedulerDAG::addLoopCarriedDependences() {
1168	LoopCarriedEdges LCE;
1169
1170	// Add loop-carried order-dependencies
1171	LoopCarriedOrderDepsTracker LCODTracker(this, &BAA, TII, TRI);
1172	LCODTracker.computeDependencies();
1173	for (unsigned I = `0`; I != SUnits.size(); I++)
1174	for (const int Succ : LCODTracker.getLoopCarried(Idx: I).set_bits())
1175	LCE.OrderDeps [&SUnits [I]].insert(X: &SUnits [Succ]);
1176
1177	LCE.modifySUnits(SUnits, TII);
1178	return LCE;
1179	}
1180
1181	/// Update the phi dependences to the DAG because ScheduleDAGInstrs no longer
1182	/// processes dependences for PHIs. This function adds true dependences
1183	/// from a PHI to a use, and a loop carried dependence from the use to the
1184	/// PHI. The loop carried dependence is represented as an anti dependence
1185	/// edge. This function also removes chain dependences between unrelated
1186	/// PHIs.
1187	void SwingSchedulerDAG::updatePhiDependences() {
1188	SmallVector<SDep, `4`> RemoveDeps;
1189	const TargetSubtargetInfo &ST = MF.getSubtarget<TargetSubtargetInfo>();
1190
1191	// Iterate over each DAG node.
1192	for (SUnit &I : SUnits) {
1193	RemoveDeps.clear();
1194	// Set to true if the instruction has an operand defined by a Phi.
1195	Register HasPhiUse;
1196	Register HasPhiDef;
1197	MachineInstr *MI = I.getInstr();
1198	// Iterate over each operand, and we process the definitions.
1199	for (const MachineOperand &MO : MI->operands()) {
1200	if (!MO.isReg())
1201	continue;
1202	Register Reg = MO.getReg();
1203	if (MO.isDef()) {
1204	// If the register is used by a Phi, then create an anti dependence.
1205	for (MachineRegisterInfo::use_instr_iterator
1206	UI = MRI.use_instr_begin(RegNo: Reg),
1207	UE = MRI.use_instr_end();
1208	UI != UE; ++UI) {
1209	MachineInstr UseMI = &UI;
1210	SUnit *SU = getSUnit(MI: UseMI);
1211	if (SU != nullptr && UseMI->isPHI()) {
1212	if (!MI->isPHI()) {
1213	SDep Dep(SU, SDep::Anti, Reg);
1214	Dep.setLatency(`1`);
1215	I.addPred(D: Dep);
1216	} else {
1217	HasPhiDef = Reg;
1218	// Add a chain edge to a dependent Phi that isn't an existing
1219	// predecessor.
1220	if (SU->NodeNum < I.NodeNum && !I.isPred(N: SU))
1221	I.addPred(D: SDep (SU, SDep::Barrier));
1222	}
1223	}
1224	}
1225	} else if (MO.isUse()) {
1226	// If the register is defined by a Phi, then create a true dependence.
1227	MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg);
1228	if (DefMI == nullptr)
1229	continue;
1230	SUnit *SU = getSUnit(MI: DefMI);
1231	if (SU != nullptr && DefMI->isPHI()) {
1232	if (!MI->isPHI()) {
1233	SDep Dep(SU, SDep::Data, Reg);
1234	Dep.setLatency(`0`);
1235	ST.adjustSchedDependency(Def: SU, DefOpIdx: `0`, Use: &I, UseOpIdx: MO.getOperandNo(), Dep,
1236	SchedModel: &SchedModel);
1237	I.addPred(D: Dep);
1238	} else {
1239	HasPhiUse = Reg;
1240	// Add a chain edge to a dependent Phi that isn't an existing
1241	// predecessor.
1242	if (SU->NodeNum < I.NodeNum && !I.isPred(N: SU))
1243	I.addPred(D: SDep (SU, SDep::Barrier));
1244	}
1245	}
1246	}
1247	}
1248	// Remove order dependences from an unrelated Phi.
1249	if (!SwpPruneDeps)
1250	continue;
1251	for (auto &PI : I.Preds) {
1252	MachineInstr *PMI = PI.getSUnit()->getInstr();
1253	if (PMI->isPHI() && PI.getKind() == SDep::Order) {
1254	if (I.getInstr()->isPHI()) {
1255	if (PMI->getOperand(i: `0`).getReg() == HasPhiUse)
1256	continue;
1257	if (getLoopPhiReg(Phi: *PMI, LoopBB: PMI->getParent()) == HasPhiDef)
1258	continue;
1259	}
1260	RemoveDeps.push_back(Elt: PI);
1261	}
1262	}
1263	for (const SDep &D : RemoveDeps)
1264	I.removePred(D);
1265	}
1266	}
1267
1268	/// Iterate over each DAG node and see if we can change any dependences
1269	/// in order to reduce the recurrence MII.
1270	void SwingSchedulerDAG::changeDependences() {
1271	// See if an instruction can use a value from the previous iteration.
1272	// If so, we update the base and offset of the instruction and change
1273	// the dependences.
1274	for (SUnit &I : SUnits) {
1275	unsigned BasePos = `0`, OffsetPos = `0`;
1276	Register NewBase;
1277	int64_t NewOffset = `0`;
1278	if (!canUseLastOffsetValue(MI: I.getInstr(), BasePos, OffsetPos, NewBase,
1279	NewOffset))
1280	continue;
1281
1282	// Get the MI and SUnit for the instruction that defines the original base.
1283	Register OrigBase = I.getInstr()->getOperand(i: BasePos).getReg();
1284	MachineInstr *DefMI = MRI.getUniqueVRegDef(Reg: OrigBase);
1285	if (!DefMI)
1286	continue;
1287	SUnit *DefSU = getSUnit(MI: DefMI);
1288	if (!DefSU)
1289	continue;
1290	// Get the MI and SUnit for the instruction that defins the new base.
1291	MachineInstr *LastMI = MRI.getUniqueVRegDef(Reg: NewBase);
1292	if (!LastMI)
1293	continue;
1294	SUnit *LastSU = getSUnit(MI: LastMI);
1295	if (!LastSU)
1296	continue;
1297
1298	if (Topo.IsReachable(SU: &I, TargetSU: LastSU))
1299	continue;
1300
1301	// Remove the dependence. The value now depends on a prior iteration.
1302	SmallVector<SDep, `4`> Deps;
1303	for (const SDep &P : I.Preds)
1304	if (P.getSUnit() == DefSU)
1305	Deps.push_back(Elt: P);
1306	for (const SDep &D : Deps) {
1307	Topo.RemovePred(M: &I, N: D.getSUnit());
1308	I.removePred(D);
1309	}
1310	// Remove the chain dependence between the instructions.
1311	Deps.clear();
1312	for (auto &P : LastSU->Preds)
1313	if (P.getSUnit() == &I && P.getKind() == SDep::Order)
1314	Deps.push_back(Elt: P);
1315	for (const SDep &D : Deps) {
1316	Topo.RemovePred(M: LastSU, N: D.getSUnit());
1317	LastSU->removePred(D);
1318	}
1319
1320	// Add a dependence between the new instruction and the instruction
1321	// that defines the new base.
1322	SDep Dep(&I, SDep::Anti, NewBase);
1323	Topo.AddPred(Y: LastSU, X: &I);
1324	LastSU->addPred(D: Dep);
1325
1326	// Remember the base and offset information so that we can update the
1327	// instruction during code generation.
1328	InstrChanges [&I] = std::make_pair(x&: NewBase, y&: NewOffset);
1329	}
1330	}
1331
1332	/// Create an instruction stream that represents a single iteration and stage of
1333	/// each instruction. This function differs from SMSchedule::finalizeSchedule in
1334	/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this
1335	/// function is an approximation of SMSchedule::finalizeSchedule with all
1336	/// non-const operations removed.
1337	static void computeScheduledInsts(const SwingSchedulerDAG *SSD,
1338	SMSchedule &Schedule,
1339	std::vector<MachineInstr *> &OrderedInsts,
1340	DenseMap<MachineInstr , unsigned*> &Stages) {
1341	DenseMap<int, std::deque<SUnit *>> Instrs;
1342
1343	// Move all instructions to the first stage from the later stages.
1344	for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1345	++Cycle) {
1346	for (int Stage = `0`, LastStage = Schedule.getMaxStageCount();
1347	Stage <= LastStage; ++Stage) {
1348	for (SUnit *SU : llvm::reverse(C&: Schedule.getInstructions(
1349	cycle: Cycle + Stage * Schedule.getInitiationInterval()))) {
1350	Instrs [Cycle].push_front(x: SU);
1351	}
1352	}
1353	}
1354
1355	for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle();
1356	++Cycle) {
1357	std::deque<SUnit *> &CycleInstrs = Instrs [Cycle];
1358	CycleInstrs = Schedule.reorderInstructions(SSD, Instrs: CycleInstrs);
1359	for (SUnit *SU : CycleInstrs) {
1360	MachineInstr *MI = SU->getInstr();
1361	OrderedInsts.push_back(x: MI);
1362	Stages [MI] = Schedule.stageScheduled(SU);
1363	}
1364	}
1365	}
1366
1367	namespace {
1368
1369	// FuncUnitSorter - Comparison operator used to sort instructions by
1370	// the number of functional unit choices.
1371	struct FuncUnitSorter {
1372	const InstrItineraryData *InstrItins;
1373	const MCSubtargetInfo *STI;
1374	DenseMap<InstrStage::FuncUnits, unsigned> Resources;
1375
1376	FuncUnitSorter(const TargetSubtargetInfo &TSI)
1377	: InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
1378
1379	// Compute the number of functional unit alternatives needed
1380	// at each stage, and take the minimum value. We prioritize the
1381	// instructions by the least number of choices first.
1382	unsigned minFuncUnits(const MachineInstr *Inst,
1383	InstrStage::FuncUnits &F) const {
1384	unsigned SchedClass = Inst->getDesc().getSchedClass();
1385	unsigned min = UINT_MAX;
1386	if (InstrItins && !InstrItins->isEmpty()) {
1387	for (const InstrStage &IS :
1388	make_range(x: InstrItins->beginStage(ItinClassIndx: SchedClass),
1389	y: InstrItins->endStage(ItinClassIndx: SchedClass))) {
1390	InstrStage::FuncUnits funcUnits = IS.getUnits();
1391	unsigned numAlternatives = llvm::popcount(Value: funcUnits);
1392	if (numAlternatives < min) {
1393	min = numAlternatives;
1394	F = funcUnits;
1395	}
1396	}
1397	return min;
1398	}
1399	if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1400	const MCSchedClassDesc *SCDesc =
1401	STI->getSchedModel().getSchedClassDesc(SchedClassIdx: SchedClass);
1402	if (!SCDesc->isValid())
1403	// No valid Schedule Class Desc for schedClass, should be
1404	// Pseudo/PostRAPseudo
1405	return min;
1406
1407	for (const MCWriteProcResEntry &PRE :
1408	make_range(x: STI->getWriteProcResBegin(SC: SCDesc),
1409	y: STI->getWriteProcResEnd(SC: SCDesc))) {
1410	if (!PRE.ReleaseAtCycle)
1411	continue;
1412	const MCProcResourceDesc *ProcResource =
1413	STI->getSchedModel().getProcResource(ProcResourceIdx: PRE.ProcResourceIdx);
1414	unsigned NumUnits = ProcResource->NumUnits;
1415	if (NumUnits < min) {
1416	min = NumUnits;
1417	F = PRE.ProcResourceIdx;
1418	}
1419	}
1420	return min;
1421	}
1422	llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1423	}
1424
1425	// Compute the critical resources needed by the instruction. This
1426	// function records the functional units needed by instructions that
1427	// must use only one functional unit. We use this as a tie breaker
1428	// for computing the resource MII. The instrutions that require
1429	// the same, highly used, functional unit have high priority.
1430	void calcCriticalResources(MachineInstr &MI) {
1431	unsigned SchedClass = MI.getDesc().getSchedClass();
1432	if (InstrItins && !InstrItins->isEmpty()) {
1433	for (const InstrStage &IS :
1434	make_range(x: InstrItins->beginStage(ItinClassIndx: SchedClass),
1435	y: InstrItins->endStage(ItinClassIndx: SchedClass))) {
1436	InstrStage::FuncUnits FuncUnits = IS.getUnits();
1437	if (llvm::popcount(Value: FuncUnits) == `1`)
1438	Resources [FuncUnits]++;
1439	}
1440	return;
1441	}
1442	if (STI && STI->getSchedModel().hasInstrSchedModel()) {
1443	const MCSchedClassDesc *SCDesc =
1444	STI->getSchedModel().getSchedClassDesc(SchedClassIdx: SchedClass);
1445	if (!SCDesc->isValid())
1446	// No valid Schedule Class Desc for schedClass, should be
1447	// Pseudo/PostRAPseudo
1448	return;
1449
1450	for (const MCWriteProcResEntry &PRE :
1451	make_range(x: STI->getWriteProcResBegin(SC: SCDesc),
1452	y: STI->getWriteProcResEnd(SC: SCDesc))) {
1453	if (!PRE.ReleaseAtCycle)
1454	continue;
1455	Resources [PRE.ProcResourceIdx]++;
1456	}
1457	return;
1458	}
1459	llvm_unreachable("Should have non-empty InstrItins or hasInstrSchedModel!");
1460	}
1461
1462	/// Return true if IS1 has less priority than IS2.
1463	bool operator()(const MachineInstr IS1, const* MachineInstr IS2) const* {
1464	InstrStage::FuncUnits F1 = `0`, F2 = `0`;
1465	unsigned MFUs1 = minFuncUnits(Inst: IS1, F&: F1);
1466	unsigned MFUs2 = minFuncUnits(Inst: IS2, F&: F2);
1467	if (MFUs1 == MFUs2)
1468	return Resources.lookup(Val: F1) < Resources.lookup(Val: F2);
1469	return MFUs1 > MFUs2;
1470	}
1471	};
1472
1473	/// Calculate the maximum register pressure of the scheduled instructions stream
1474	class HighRegisterPressureDetector {
1475	MachineBasicBlock *OrigMBB;
1476	const MachineRegisterInfo &MRI;
1477	const TargetRegisterInfo *TRI;
1478
1479	const unsigned PSetNum;
1480
1481	// Indexed by PSet ID
1482	// InitSetPressure takes into account the register pressure of live-in
1483	// registers. It's not depend on how the loop is scheduled, so it's enough to
1484	// calculate them once at the beginning.
1485	std::vector<unsigned> InitSetPressure;
1486
1487	// Indexed by PSet ID
1488	// Upper limit for each register pressure set
1489	std::vector<unsigned> PressureSetLimit;
1490
1491	DenseMap<MachineInstr *, RegisterOperands> ROMap;
1492
1493	using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, `4`>>;
1494
1495	public:
1496	using OrderedInstsTy = std::vector<MachineInstr *>;
1497	using Instr2StageTy = DenseMap<MachineInstr , unsigned*>;
1498
1499	private:
1500	static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
1501	if (Pressures.size() == `0`) {
1502	dbgs() << "[]";
1503	} else {
1504	char Prefix = `'['`;
1505	for (unsigned P : Pressures) {
1506	dbgs() << Prefix << P;
1507	Prefix = `' '`;
1508	}
1509	dbgs() << `']'`;
1510	}
1511	}
1512
1513	void dumpPSet(Register Reg) const {
1514	dbgs() << "Reg=" << printReg(Reg, TRI, SubIdx: `0`, MRI: &MRI) << " PSet=";
1515	for (auto PSetIter = MRI.getPressureSets(RegUnit: Reg); PSetIter.isValid();
1516	++PSetIter) {
1517	dbgs() << *PSetIter << `' '`;
1518	}
1519	dbgs() << `'\n'`;
1520	}
1521
1522	void increaseRegisterPressure(std::vector<unsigned> &Pressure,
1523	Register Reg) const {
1524	auto PSetIter = MRI.getPressureSets(RegUnit: Reg);
1525	unsigned Weight = PSetIter.getWeight();
1526	for (; PSetIter.isValid(); ++PSetIter)
1527	Pressure [*PSetIter] += Weight;
1528	}
1529
1530	void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
1531	Register Reg) const {
1532	auto PSetIter = MRI.getPressureSets(RegUnit: Reg);
1533	unsigned Weight = PSetIter.getWeight();
1534	for (; PSetIter.isValid(); ++PSetIter) {
1535	auto &P = Pressure [*PSetIter];
1536	assert(P >= Weight &&
1537	"register pressure must be greater than or equal weight");
1538	P -= Weight;
1539	}
1540	}
1541
1542	// Return true if Reg is reserved one, for example, stack pointer
1543	bool isReservedRegister(Register Reg) const {
1544	return Reg.isPhysical() && MRI.isReserved(PhysReg: Reg.asMCReg());
1545	}
1546
1547	bool isDefinedInThisLoop(Register Reg) const {
1548	return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
1549	}
1550
1551	// Search for live-in variables. They are factored into the register pressure
1552	// from the begining. Live-in variables used by every iteration should be
1553	// considered as alive throughout the loop. For example, the variable `c` in
1554	// following code. \code
1555	// int c = ...;
1556	// for (int i = 0; i < n; i++)
1557	// a[i] += b[i] + c;
1558	// \endcode
1559	void computeLiveIn() {
1560	DenseSet<Register> Used;
1561	for (auto &MI : *OrigMBB) {
1562	if (MI.isDebugInstr())
1563	continue;
1564	for (auto &Use : ROMap [&MI].Uses) {
1565	auto Reg = Use.RegUnit;
1566	// Ignore the variable that appears only on one side of phi instruction
1567	// because it's used only at the first iteration.
1568	if (MI.isPHI() && Reg != getLoopPhiReg(Phi: MI, LoopBB: OrigMBB))
1569	continue;
1570	if (isReservedRegister(Reg))
1571	continue;
1572	if (isDefinedInThisLoop(Reg))
1573	continue;
1574	Used.insert(V: Reg);
1575	}
1576	}
1577
1578	for (auto LiveIn : Used)
1579	increaseRegisterPressure(Pressure&: InitSetPressure, Reg: LiveIn);
1580	}
1581
1582	// Calculate the upper limit of each pressure set
1583	void computePressureSetLimit(const RegisterClassInfo &RCI) {
1584	for (unsigned PSet = `0`; PSet < PSetNum; PSet++)
1585	PressureSetLimit [PSet] = RCI.getRegPressureSetLimit(Idx: PSet);
1586	}
1587
1588	// There are two patterns of last-use.
1589	// - by an instruction of the current iteration
1590	// - by a phi instruction of the next iteration (loop carried value)
1591	//
1592	// Furthermore, following two groups of instructions are executed
1593	// simultaneously
1594	// - next iteration's phi instructions in i-th stage
1595	// - current iteration's instructions in i+1-th stage
1596	//
1597	// This function calculates the last-use of each register while taking into
1598	// account the above two patterns.
1599	Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
1600	Instr2StageTy &Stages) const {
1601	// We treat virtual registers that are defined and used in this loop.
1602	// Following virtual register will be ignored
1603	// - live-in one
1604	// - defined but not used in the loop (potentially live-out)
1605	DenseSet<Register> TargetRegs;
1606	const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
1607	if (isDefinedInThisLoop(Reg))
1608	TargetRegs.insert(V: Reg);
1609	};
1610	for (MachineInstr *MI : OrderedInsts) {
1611	if (MI->isPHI()) {
1612	Register Reg = getLoopPhiReg(Phi: *MI, LoopBB: OrigMBB);
1613	UpdateTargetRegs(Reg);
1614	} else {
1615	for (auto &Use : ROMap.find(Val: MI)->getSecond().Uses)
1616	UpdateTargetRegs(Use.RegUnit);
1617	}
1618	}
1619
1620	const auto InstrScore = [&Stages](MachineInstr *MI) {
1621	return Stages [MI] + MI->isPHI();
1622	};
1623
1624	DenseMap<Register, MachineInstr *> LastUseMI;
1625	for (MachineInstr *MI : llvm::reverse(C: OrderedInsts)) {
1626	for (auto &Use : ROMap.find(Val: MI)->getSecond().Uses) {
1627	auto Reg = Use.RegUnit;
1628	if (!TargetRegs.contains(V: Reg))
1629	continue;
1630	auto [Ite, Inserted] = LastUseMI.try_emplace(Key: Reg, Args&: MI);
1631	if (!Inserted) {
1632	MachineInstr *Orig = Ite ->second;
1633	MachineInstr *New = MI;
1634	if (InstrScore(Orig) < InstrScore(New))
1635	Ite ->second = New;
1636	}
1637	}
1638	}
1639
1640	Instr2LastUsesTy LastUses;
1641	for (auto &Entry : LastUseMI)
1642	LastUses [Entry.second].insert(V: Entry.first);
1643	return LastUses;
1644	}
1645
1646	// Compute the maximum register pressure of the kernel. We'll simulate #Stage
1647	// iterations and check the register pressure at the point where all stages
1648	// overlapping.
1649	//
1650	// An example of unrolled loop where #Stage is 4..
1651	// Iter i+0 i+1 i+2 i+3
1652	// ------------------------
1653	// Stage 0
1654	// Stage 1 0
1655	// Stage 2 1 0
1656	// Stage 3 2 1 0 <- All stages overlap
1657	//
1658	std::vector<unsigned>
1659	computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
1660	Instr2StageTy &Stages,
1661	const unsigned StageCount) const {
1662	using RegSetTy = SmallDenseSet<Register, `16`>;
1663
1664	// Indexed by #Iter. To treat "local" variables of each stage separately, we
1665	// manage the liveness of the registers independently by iterations.
1666	SmallVector<RegSetTy> LiveRegSets(StageCount);
1667
1668	auto CurSetPressure = InitSetPressure;
1669	auto MaxSetPressure = InitSetPressure;
1670	auto LastUses = computeLastUses(OrderedInsts, Stages);
1671
1672	LLVM_DEBUG({
1673	dbgs() << "Ordered instructions:\n";
1674	for (MachineInstr *MI : OrderedInsts) {
1675	dbgs() << "Stage " << Stages[MI] << ": ";
1676	MI->dump();
1677	}
1678	});
1679
1680	const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
1681	Register Reg) {
1682	if (!Reg.isValid() \|\| isReservedRegister(Reg))
1683	return;
1684
1685	bool Inserted = RegSet.insert(V: Reg).second;
1686	if (!Inserted)
1687	return;
1688
1689	LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, `0`, &MRI) << "\n");
1690	increaseRegisterPressure(Pressure&: CurSetPressure, Reg);
1691	LLVM_DEBUG(dumpPSet(Reg));
1692	};
1693
1694	const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
1695	Register Reg) {
1696	if (!Reg.isValid() \|\| isReservedRegister(Reg))
1697	return;
1698
1699	// live-in register
1700	if (!RegSet.contains(V: Reg))
1701	return;
1702
1703	LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, `0`, &MRI) << "\n");
1704	RegSet.erase(V: Reg);
1705	decreaseRegisterPressure(Pressure&: CurSetPressure, Reg);
1706	LLVM_DEBUG(dumpPSet(Reg));
1707	};
1708
1709	for (unsigned I = `0`; I < StageCount; I++) {
1710	for (MachineInstr *MI : OrderedInsts) {
1711	const auto Stage = Stages [MI];
1712	if (I < Stage)
1713	continue;
1714
1715	const unsigned Iter = I - Stage;
1716
1717	for (auto &Def : ROMap.find(Val: MI)->getSecond().Defs)
1718	InsertReg(LiveRegSets [Iter], Def.RegUnit);
1719
1720	for (auto LastUse : LastUses [MI]) {
1721	if (MI->isPHI()) {
1722	if (Iter != `0`)
1723	EraseReg(LiveRegSets [Iter - `1`], LastUse);
1724	} else {
1725	EraseReg(LiveRegSets [Iter], LastUse);
1726	}
1727	}
1728
1729	for (unsigned PSet = `0`; PSet < PSetNum; PSet++)
1730	MaxSetPressure [PSet] =
1731	std::max(a: MaxSetPressure [PSet], b: CurSetPressure [PSet]);
1732
1733	LLVM_DEBUG({
1734	dbgs() << "CurSetPressure=";
1735	dumpRegisterPressures(CurSetPressure);
1736	dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
1737	MI->dump();
1738	});
1739	}
1740	}
1741
1742	return MaxSetPressure;
1743	}
1744
1745	public:
1746	HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
1747	const MachineFunction &MF)
1748	: OrigMBB(OrigMBB), MRI(MF.getRegInfo()),
1749	TRI(MF.getSubtarget().getRegisterInfo()),
1750	PSetNum(TRI->getNumRegPressureSets()), InitSetPressure (PSetNum, `0`),
1751	PressureSetLimit (PSetNum, `0`) {}
1752
1753	// Used to calculate register pressure, which is independent of loop
1754	// scheduling.
1755	void init(const RegisterClassInfo &RCI) {
1756	for (MachineInstr &MI : *OrigMBB) {
1757	if (MI.isDebugInstr())
1758	continue;
1759	ROMap [&MI].collect(MI, TRI: TRI, MRI, TrackLaneMasks: false, IgnoreDead: true*);
1760	}
1761
1762	computeLiveIn();
1763	computePressureSetLimit(RCI);
1764	}
1765
1766	// Calculate the maximum register pressures of the loop and check if they
1767	// exceed the limit
1768	bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
1769	const unsigned MaxStage) const {
1770	assert(`0` <= RegPressureMargin && RegPressureMargin <= `100` &&
1771	"the percentage of the margin must be between 0 to 100");
1772
1773	OrderedInstsTy OrderedInsts;
1774	Instr2StageTy Stages;
1775	computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
1776	const auto MaxSetPressure =
1777	computeMaxSetPressure(OrderedInsts, Stages, StageCount: MaxStage + `1`);
1778
1779	LLVM_DEBUG({
1780	dbgs() << "Dump MaxSetPressure:\n";
1781	for (unsigned I = `0`; I < MaxSetPressure.size(); I++) {
1782	dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
1783	}
1784	dbgs() << `'\n'`;
1785	});
1786
1787	for (unsigned PSet = `0`; PSet < PSetNum; PSet++) {
1788	unsigned Limit = PressureSetLimit [PSet];
1789	unsigned Margin = Limit * RegPressureMargin / `100`;
1790	LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
1791	<< " Margin=" << Margin << "\n");
1792	if (Limit < MaxSetPressure [PSet] + Margin) {
1793	LLVM_DEBUG(
1794	dbgs()
1795	<< "Rejected the schedule because of too high register pressure\n");
1796	return true;
1797	}
1798	}
1799	return false;
1800	}
1801	};
1802
1803	} // end anonymous namespace
1804
1805	/// Calculate the resource constrained minimum initiation interval for the
1806	/// specified loop. We use the DFA to model the resources needed for
1807	/// each instruction, and we ignore dependences. A different DFA is created
1808	/// for each cycle that is required. When adding a new instruction, we attempt
1809	/// to add it to each existing DFA, until a legal space is found. If the
1810	/// instruction cannot be reserved in an existing DFA, we create a new one.
1811	unsigned SwingSchedulerDAG::calculateResMII() {
1812	LLVM_DEBUG(dbgs() << "calculateResMII:\n");
1813	ResourceManager RM(&MF.getSubtarget(), this);
1814	return RM.calculateResMII();
1815	}
1816
1817	/// Calculate the recurrence-constrainted minimum initiation interval.
1818	/// Iterate over each circuit. Compute the delay(c) and distance(c)
1819	/// for each circuit. The II needs to satisfy the inequality
1820	/// delay(c) - IIdistance(c) <= 0. For each circuit, choose the smallest*
1821	/// II that satisfies the inequality, and the RecMII is the maximum
1822	/// of those values.
1823	unsigned SwingSchedulerDAG::calculateRecMII(NodeSetType &NodeSets) {
1824	unsigned RecMII = `0`;
1825
1826	for (NodeSet &Nodes : NodeSets) {
1827	if (Nodes.empty())
1828	continue;
1829
1830	unsigned Delay = Nodes.getLatency();
1831	unsigned Distance = `1`;
1832
1833	// ii = ceil(delay / distance)
1834	unsigned CurMII = (Delay + Distance - `1`) / Distance;
1835	Nodes.setRecMII(CurMII);
1836	if (CurMII > RecMII)
1837	RecMII = CurMII;
1838	}
1839
1840	return RecMII;
1841	}
1842
1843	/// Create the adjacency structure of the nodes in the graph.
1844	void SwingSchedulerDAG::Circuits::createAdjacencyStructure(
1845	SwingSchedulerDAG *DAG) {
1846	BitVector Added(SUnits.size());
1847	DenseMap<int, int> OutputDeps;
1848	for (int i = `0`, e = SUnits.size(); i != e; ++i) {
1849	Added.reset();
1850	// Add any successor to the adjacency matrix and exclude duplicates.
1851	for (auto &OE : DAG->DDG ->getOutEdges(SU: &SUnits [i])) {
1852	// Only create a back-edge on the first and last nodes of a dependence
1853	// chain. This records any chains and adds them later.
1854	if (OE.isOutputDep()) {
1855	int N = OE.getDst()->NodeNum;
1856	int BackEdge = i;
1857	auto Dep = OutputDeps.find(Val: BackEdge);
1858	if (Dep != OutputDeps.end()) {
1859	BackEdge = Dep ->second;
1860	OutputDeps.erase(I: Dep);
1861	}
1862	OutputDeps [N] = BackEdge;
1863	}
1864	// Do not process a boundary node, an artificial node.
1865	if (OE.getDst()->isBoundaryNode() \|\| OE.isArtificial())
1866	continue;
1867
1868	// This code is retained o preserve previous behavior and prevent
1869	// regression. This condition means that anti-dependnecies within an
1870	// iteration are ignored when searching circuits. Therefore it's natural
1871	// to consider this dependence as well.
1872	// FIXME: Remove this code if it doesn't have significant impact on
1873	// performance.
1874	if (OE.isAntiDep())
1875	continue;
1876
1877	int N = OE.getDst()->NodeNum;
1878	if (!Added.test(Idx: N)) {
1879	AdjK [i].push_back(Elt: N);
1880	Added.set(N);
1881	}
1882	}
1883	// A chain edge between a store and a load is treated as a back-edge in the
1884	// adjacency matrix.
1885	for (auto &IE : DAG->DDG ->getInEdges(SU: &SUnits [i])) {
1886	SUnit *Src = IE.getSrc();
1887	SUnit *Dst = IE.getDst();
1888	if (!Dst->getInstr()->mayStore() \|\| !DAG->isLoopCarriedDep(Edge: IE))
1889	continue;
1890	if (IE.isOrderDep() && Src->getInstr()->mayLoad()) {
1891	int N = Src->NodeNum;
1892	if (!Added.test(Idx: N)) {
1893	AdjK [i].push_back(Elt: N);
1894	Added.set(N);
1895	}
1896	}
1897	}
1898	}
1899	// Add back-edges in the adjacency matrix for the output dependences.
1900	for (auto &OD : OutputDeps)
1901	if (!Added.test(Idx: OD.second)) {
1902	AdjK [OD.first].push_back(Elt: OD.second);
1903	Added.set(OD.second);
1904	}
1905	}
1906
1907	/// Identify an elementary circuit in the dependence graph starting at the
1908	/// specified node.
1909	bool SwingSchedulerDAG::Circuits::circuit(int V, int S, NodeSetType &NodeSets,
1910	const SwingSchedulerDAG *DAG,
1911	bool HasBackedge) {
1912	SUnit *SV = &SUnits [V];
1913	bool F = false;
1914	Stack.insert(X: SV);
1915	Blocked.set(V);
1916
1917	for (auto W : AdjK [V]) {
1918	if (NumPaths > MaxPaths)
1919	break;
1920	if (W < S)
1921	continue;
1922	if (W == S) {
1923	if (!HasBackedge)
1924	NodeSets.push_back(Elt: NodeSet (Stack.begin(), Stack.end(), DAG));
1925	F = true;
1926	++NumPaths;
1927	break;
1928	}
1929	if (!Blocked.test(Idx: W)) {
1930	if (circuit(V: W, S, NodeSets, DAG,
1931	HasBackedge: Node2Idx->at(n: W) < Node2Idx->at(n: V) ? true : HasBackedge))
1932	F = true;
1933	}
1934	}
1935
1936	if (F)
1937	unblock(U: V);
1938	else {
1939	for (auto W : AdjK [V]) {
1940	if (W < S)
1941	continue;
1942	B [W].insert(Ptr: SV);
1943	}
1944	}
1945	Stack.pop_back();
1946	return F;
1947	}
1948
1949	/// Unblock a node in the circuit finding algorithm.
1950	void SwingSchedulerDAG::Circuits::unblock(int U) {
1951	Blocked.reset(Idx: U);
1952	SmallPtrSet<SUnit *, `4`> &BU = B [U];
1953	while (!BU.empty()) {
1954	SmallPtrSet<SUnit *, `4`>::iterator SI = BU.begin();
1955	assert(SI != BU.end() && "Invalid B set.");
1956	SUnit W = SI;
1957	BU.erase(Ptr: W);
1958	if (Blocked.test(Idx: W->NodeNum))
1959	unblock(U: W->NodeNum);
1960	}
1961	}
1962
1963	/// Identify all the elementary circuits in the dependence graph using
1964	/// Johnson's circuit algorithm.
1965	void SwingSchedulerDAG::findCircuits(NodeSetType &NodeSets) {
1966	Circuits Cir(SUnits, Topo);
1967	// Create the adjacency structure.
1968	Cir.createAdjacencyStructure(DAG: this);
1969	for (int I = `0`, E = SUnits.size(); I != E; ++I) {
1970	Cir.reset();
1971	Cir.circuit(V: I, S: I, NodeSets, DAG: this);
1972	}
1973	}
1974
1975	// Create artificial dependencies between the source of COPY/REG_SEQUENCE that
1976	// is loop-carried to the USE in next iteration. This will help pipeliner avoid
1977	// additional copies that are needed across iterations. An artificial dependence
1978	// edge is added from USE to SOURCE of COPY/REG_SEQUENCE.
1979
1980	// PHI-------Anti-Dep-----> COPY/REG_SEQUENCE (loop-carried)
1981	// SRCOfCopY------True-Dep---> COPY/REG_SEQUENCE
1982	// PHI-------True-Dep------> USEOfPhi
1983
1984	// The mutation creates
1985	// USEOfPHI -------Artificial-Dep---> SRCOfCopy
1986
1987	// This overall will ensure, the USEOfPHI is scheduled before SRCOfCopy
1988	// (since USE is a predecessor), implies, the COPY/ REG_SEQUENCE is scheduled
1989	// late to avoid additional copies across iterations. The possible scheduling
1990	// order would be
1991	// USEOfPHI --- SRCOfCopy--- COPY/REG_SEQUENCE.
1992
1993	void SwingSchedulerDAG::CopyToPhiMutation::apply(ScheduleDAGInstrs *DAG) {
1994	for (SUnit &SU : DAG->SUnits) {
1995	// Find the COPY/REG_SEQUENCE instruction.
1996	if (!SU.getInstr()->isCopy() && !SU.getInstr()->isRegSequence())
1997	continue;
1998
1999	// Record the loop carried PHIs.
2000	SmallVector<SUnit *, `4`> PHISUs;
2001	// Record the SrcSUs that feed the COPY/REG_SEQUENCE instructions.
2002	SmallVector<SUnit *, `4`> SrcSUs;
2003
2004	for (auto &Dep : SU.Preds) {
2005	SUnit *TmpSU = Dep.getSUnit();
2006	MachineInstr *TmpMI = TmpSU->getInstr();
2007	SDep::Kind DepKind = Dep.getKind();
2008	// Save the loop carried PHI.
2009	if (DepKind == SDep::Anti && TmpMI->isPHI())
2010	PHISUs.push_back(Elt: TmpSU);
2011	// Save the source of COPY/REG_SEQUENCE.
2012	// If the source has no pre-decessors, we will end up creating cycles.
2013	else if (DepKind == SDep::Data && !TmpMI->isPHI() && TmpSU->NumPreds > `0`)
2014	SrcSUs.push_back(Elt: TmpSU);
2015	}
2016
2017	if (PHISUs.size() == `0` \|\| SrcSUs.size() == `0`)
2018	continue;
2019
2020	// Find the USEs of PHI. If the use is a PHI or REG_SEQUENCE, push back this
2021	// SUnit to the container.
2022	SmallVector<SUnit *, `8`> UseSUs;
2023	// Do not use iterator based loop here as we are updating the container.
2024	for (size_t Index = `0`; Index < PHISUs.size(); ++Index) {
2025	for (auto &Dep : PHISUs [Index]->Succs) {
2026	if (Dep.getKind() != SDep::Data)
2027	continue;
2028
2029	SUnit *TmpSU = Dep.getSUnit();
2030	MachineInstr *TmpMI = TmpSU->getInstr();
2031	if (TmpMI->isPHI() \|\| TmpMI->isRegSequence()) {
2032	PHISUs.push_back(Elt: TmpSU);
2033	continue;
2034	}
2035	UseSUs.push_back(Elt: TmpSU);
2036	}
2037	}
2038
2039	if (UseSUs.size() == `0`)
2040	continue;
2041
2042	SwingSchedulerDAG *SDAG = cast<SwingSchedulerDAG>(Val: DAG);
2043	// Add the artificial dependencies if it does not form a cycle.
2044	for (auto *I : UseSUs) {
2045	for (auto *Src : SrcSUs) {
2046	if (!SDAG->Topo.IsReachable(SU: I, TargetSU: Src) && Src != I) {
2047	Src->addPred(D: SDep (I, SDep::Artificial));
2048	SDAG->Topo.AddPred(Y: Src, X: I);
2049	}
2050	}
2051	}
2052	}
2053	}
2054
2055	/// Compute several functions need to order the nodes for scheduling.
2056	/// ASAP - Earliest time to schedule a node.
2057	/// ALAP - Latest time to schedule a node.
2058	/// MOV - Mobility function, difference between ALAP and ASAP.
2059	/// D - Depth of each node.
2060	/// H - Height of each node.
2061	void SwingSchedulerDAG::computeNodeFunctions(NodeSetType &NodeSets) {
2062	ScheduleInfo.resize(new_size: SUnits.size());
2063
2064	LLVM_DEBUG({
2065	for (int I : Topo) {
2066	const SUnit &SU = SUnits[I];
2067	dumpNode(SU);
2068	}
2069	});
2070
2071	int maxASAP = `0`;
2072	// Compute ASAP and ZeroLatencyDepth.
2073	for (int I : Topo) {
2074	int asap = `0`;
2075	int zeroLatencyDepth = `0`;
2076	SUnit *SU = &SUnits [I];
2077	for (const auto &IE : DDG ->getInEdges(SU)) {
2078	SUnit *Pred = IE.getSrc();
2079	if (IE.getLatency() == `0`)
2080	zeroLatencyDepth =
2081	std::max(a: zeroLatencyDepth, b: getZeroLatencyDepth(Node: Pred) + `1`);
2082	if (IE.ignoreDependence(IgnoreAnti: true))
2083	continue;
2084	asap = std::max(a: asap, b: (int)(getASAP(Node: Pred) + IE.getLatency() -
2085	IE.getDistance() * MII));
2086	}
2087	maxASAP = std::max(a: maxASAP, b: asap);
2088	ScheduleInfo [I].ASAP = asap;
2089	ScheduleInfo [I].ZeroLatencyDepth = zeroLatencyDepth;
2090	}
2091
2092	// Compute ALAP, ZeroLatencyHeight, and MOV.
2093	for (int I : llvm::reverse(C&: Topo)) {
2094	int alap = maxASAP;
2095	int zeroLatencyHeight = `0`;
2096	SUnit *SU = &SUnits [I];
2097	for (const auto &OE : DDG ->getOutEdges(SU)) {
2098	SUnit *Succ = OE.getDst();
2099	if (Succ->isBoundaryNode())
2100	continue;
2101	if (OE.getLatency() == `0`)
2102	zeroLatencyHeight =
2103	std::max(a: zeroLatencyHeight, b: getZeroLatencyHeight(Node: Succ) + `1`);
2104	if (OE.ignoreDependence(IgnoreAnti: true))
2105	continue;
2106	alap = std::min(a: alap, b: (int)(getALAP(Node: Succ) - OE.getLatency() +
2107	OE.getDistance() * MII));
2108	}
2109
2110	ScheduleInfo [I].ALAP = alap;
2111	ScheduleInfo [I].ZeroLatencyHeight = zeroLatencyHeight;
2112	}
2113
2114	// After computing the node functions, compute the summary for each node set.
2115	for (NodeSet &I : NodeSets)
2116	I.computeNodeSetInfo(SSD: this);
2117
2118	LLVM_DEBUG({
2119	for (unsigned i = `0`; i < SUnits.size(); i++) {
2120	dbgs() << "\tNode " << i << ":\n";
2121	dbgs() << "\t ASAP = " << getASAP(&SUnits[i]) << "\n";
2122	dbgs() << "\t ALAP = " << getALAP(&SUnits[i]) << "\n";
2123	dbgs() << "\t MOV = " << getMOV(&SUnits[i]) << "\n";
2124	dbgs() << "\t D = " << getDepth(&SUnits[i]) << "\n";
2125	dbgs() << "\t H = " << getHeight(&SUnits[i]) << "\n";
2126	dbgs() << "\t ZLD = " << getZeroLatencyDepth(&SUnits[i]) << "\n";
2127	dbgs() << "\t ZLH = " << getZeroLatencyHeight(&SUnits[i]) << "\n";
2128	}
2129	});
2130	}
2131
2132	/// Compute the Pred_L(O) set, as defined in the paper. The set is defined
2133	/// as the predecessors of the elements of NodeOrder that are not also in
2134	/// NodeOrder.
2135	static bool pred_L(SetVector<SUnit *> &NodeOrder,
2136	SmallSetVector<SUnit , `8`> &Preds, SwingSchedulerDDG DDG,
2137	const NodeSet S = nullptr*) {
2138	Preds.clear();
2139
2140	for (SUnit *SU : NodeOrder) {
2141	for (const auto &IE : DDG->getInEdges(SU)) {
2142	SUnit *PredSU = IE.getSrc();
2143	if (S && S->count(SU: PredSU) == `0`)
2144	continue;
2145	if (IE.ignoreDependence(IgnoreAnti: true))
2146	continue;
2147	if (NodeOrder.count(key: PredSU) == `0`)
2148	Preds.insert(X: PredSU);
2149	}
2150
2151	// FIXME: The following loop-carried dependencies may also need to be
2152	// considered.
2153	// - Physical register dependencies (true-dependence and WAW).
2154	// - Memory dependencies.
2155	for (const auto &OE : DDG->getOutEdges(SU)) {
2156	SUnit *SuccSU = OE.getDst();
2157	if (!OE.isAntiDep())
2158	continue;
2159	if (S && S->count(SU: SuccSU) == `0`)
2160	continue;
2161	if (NodeOrder.count(key: SuccSU) == `0`)
2162	Preds.insert(X: SuccSU);
2163	}
2164	}
2165	return !Preds.empty();
2166	}
2167
2168	/// Compute the Succ_L(O) set, as defined in the paper. The set is defined
2169	/// as the successors of the elements of NodeOrder that are not also in
2170	/// NodeOrder.
2171	static bool succ_L(SetVector<SUnit *> &NodeOrder,
2172	SmallSetVector<SUnit , `8`> &Succs, SwingSchedulerDDG DDG,
2173	const NodeSet S = nullptr*) {
2174	Succs.clear();
2175
2176	for (SUnit *SU : NodeOrder) {
2177	for (const auto &OE : DDG->getOutEdges(SU)) {
2178	SUnit *SuccSU = OE.getDst();
2179	if (S && S->count(SU: SuccSU) == `0`)
2180	continue;
2181	if (OE.ignoreDependence(IgnoreAnti: false))
2182	continue;
2183	if (NodeOrder.count(key: SuccSU) == `0`)
2184	Succs.insert(X: SuccSU);
2185	}
2186
2187	// FIXME: The following loop-carried dependencies may also need to be
2188	// considered.
2189	// - Physical register dependnecies (true-dependnece and WAW).
2190	// - Memory dependencies.
2191	for (const auto &IE : DDG->getInEdges(SU)) {
2192	SUnit *PredSU = IE.getSrc();
2193	if (!IE.isAntiDep())
2194	continue;
2195	if (S && S->count(SU: PredSU) == `0`)
2196	continue;
2197	if (NodeOrder.count(key: PredSU) == `0`)
2198	Succs.insert(X: PredSU);
2199	}
2200	}
2201	return !Succs.empty();
2202	}
2203
2204	/// Return true if there is a path from the specified node to any of the nodes
2205	/// in DestNodes. Keep track and return the nodes in any path.
2206	static bool computePath(SUnit Cur, SetVector<SUnit > &Path,
2207	SetVector<SUnit *> &DestNodes,
2208	SetVector<SUnit *> &Exclude,
2209	SmallPtrSet<SUnit *, `8`> &Visited,
2210	SwingSchedulerDDG *DDG) {
2211	if (Cur->isBoundaryNode())
2212	return false;
2213	if (Exclude.contains(key: Cur))
2214	return false;
2215	if (DestNodes.contains(key: Cur))
2216	return true;
2217	if (!Visited.insert(Ptr: Cur).second)
2218	return Path.contains(key: Cur);
2219	bool FoundPath = false;
2220	for (const auto &OE : DDG->getOutEdges(SU: Cur))
2221	if (!OE.ignoreDependence(IgnoreAnti: false))
2222	FoundPath \|=
2223	computePath(Cur: OE.getDst(), Path, DestNodes, Exclude, Visited, DDG);
2224	for (const auto &IE : DDG->getInEdges(SU: Cur))
2225	if (IE.isAntiDep() && IE.getDistance() == `0`)
2226	FoundPath \|=
2227	computePath(Cur: IE.getSrc(), Path, DestNodes, Exclude, Visited, DDG);
2228	if (FoundPath)
2229	Path.insert(X: Cur);
2230	return FoundPath;
2231	}
2232
2233	/// Compute the live-out registers for the instructions in a node-set.
2234	/// The live-out registers are those that are defined in the node-set,
2235	/// but not used. Except for use operands of Phis.
2236	static void computeLiveOuts(MachineFunction &MF, RegPressureTracker &RPTracker,
2237	NodeSet &NS) {
2238	const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2239	MachineRegisterInfo &MRI = MF.getRegInfo();
2240	SmallVector<VRegMaskOrUnit, `8`> LiveOutRegs;
2241	SmallSet<Register, `4`> Uses;
2242	for (SUnit *SU : NS) {
2243	const MachineInstr *MI = SU->getInstr();
2244	if (MI->isPHI())
2245	continue;
2246	for (const MachineOperand &MO : MI->all_uses()) {
2247	Register Reg = MO.getReg();
2248	if (Reg.isVirtual())
2249	Uses.insert(V: Reg);
2250	else if (MRI.isAllocatable(PhysReg: Reg))
2251	Uses.insert_range(R: TRI->regunits(Reg: Reg.asMCReg()));
2252	}
2253	}
2254	for (SUnit *SU : NS)
2255	for (const MachineOperand &MO : SU->getInstr()->all_defs())
2256	if (!MO.isDead()) {
2257	Register Reg = MO.getReg();
2258	if (Reg.isVirtual()) {
2259	if (!Uses.count(V: Reg))
2260	LiveOutRegs.emplace_back(Args&: Reg, Args: LaneBitmask::getNone());
2261	} else if (MRI.isAllocatable(PhysReg: Reg)) {
2262	for (MCRegUnit Unit : TRI->regunits(Reg: Reg.asMCReg()))
2263	if (!Uses.count(V: Unit))
2264	LiveOutRegs.emplace_back(Args&: Unit, Args: LaneBitmask::getNone());
2265	}
2266	}
2267	RPTracker.addLiveRegs(Regs: LiveOutRegs);
2268	}
2269
2270	/// A heuristic to filter nodes in recurrent node-sets if the register
2271	/// pressure of a set is too high.
2272	void SwingSchedulerDAG::registerPressureFilter(NodeSetType &NodeSets) {
2273	for (auto &NS : NodeSets) {
2274	// Skip small node-sets since they won't cause register pressure problems.
2275	if (NS.size() <= `2`)
2276	continue;
2277	IntervalPressure RecRegPressure;
2278	RegPressureTracker RecRPTracker(RecRegPressure);
2279	RecRPTracker.init(mf: &MF, rci: &RegClassInfo, lis: &LIS, mbb: BB, pos: BB->end(), TrackLaneMasks: false, TrackUntiedDefs: true);
2280	computeLiveOuts(MF, RPTracker&: RecRPTracker, NS);
2281	RecRPTracker.closeBottom();
2282
2283	std::vector<SUnit *> SUnits(NS.begin(), NS.end());
2284	llvm::sort(C&: SUnits, Comp: [](const SUnit A, const* SUnit *B) {
2285	return A->NodeNum > B->NodeNum;
2286	});
2287
2288	for (auto &SU : SUnits) {
2289	// Since we're computing the register pressure for a subset of the
2290	// instructions in a block, we need to set the tracker for each
2291	// instruction in the node-set. The tracker is set to the instruction
2292	// just after the one we're interested in.
2293	MachineBasicBlock::const_iterator CurInstI = SU->getInstr();
2294	RecRPTracker.setPos(std::next(x: CurInstI));
2295
2296	RegPressureDelta RPDelta;
2297	ArrayRef<PressureChange> CriticalPSets;
2298	RecRPTracker.getMaxUpwardPressureDelta(MI: SU->getInstr(), PDiff: nullptr, Delta&: RPDelta,
2299	CriticalPSets,
2300	MaxPressureLimit: RecRegPressure.MaxSetPressure);
2301	if (RPDelta.Excess.isValid()) {
2302	LLVM_DEBUG(
2303	dbgs() << "Excess register pressure: SU(" << SU->NodeNum << ") "
2304	<< TRI->getRegPressureSetName(RPDelta.Excess.getPSet())
2305	<< ":" << RPDelta.Excess.getUnitInc() << "\n");
2306	NS.setExceedPressure(SU);
2307	break;
2308	}
2309	RecRPTracker.recede();
2310	}
2311	}
2312	}
2313
2314	/// A heuristic to colocate node sets that have the same set of
2315	/// successors.
2316	void SwingSchedulerDAG::colocateNodeSets(NodeSetType &NodeSets) {
2317	unsigned Colocate = `0`;
2318	for (int i = `0`, e = NodeSets.size(); i < e; ++i) {
2319	NodeSet &N1 = NodeSets [i];
2320	SmallSetVector<SUnit *, `8`> S1;
2321	if (N1.empty() \|\| !succ_L(NodeOrder&: N1, Succs&: S1, DDG: DDG.get()))
2322	continue;
2323	for (int j = i + `1`; j < e; ++j) {
2324	NodeSet &N2 = NodeSets [j];
2325	if (N1.compareRecMII(RHS&: N2) != `0`)
2326	continue;
2327	SmallSetVector<SUnit *, `8`> S2;
2328	if (N2.empty() \|\| !succ_L(NodeOrder&: N2, Succs&: S2, DDG: DDG.get()))
2329	continue;
2330	if (llvm::set_is_subset(S1, S2) && S1.size() == S2.size()) {
2331	N1.setColocate(++Colocate);
2332	N2.setColocate(Colocate);
2333	break;
2334	}
2335	}
2336	}
2337	}
2338
2339	/// Check if the existing node-sets are profitable. If not, then ignore the
2340	/// recurrent node-sets, and attempt to schedule all nodes together. This is
2341	/// a heuristic. If the MII is large and all the recurrent node-sets are small,
2342	/// then it's best to try to schedule all instructions together instead of
2343	/// starting with the recurrent node-sets.
2344	void SwingSchedulerDAG::checkNodeSets(NodeSetType &NodeSets) {
2345	// Look for loops with a large MII.
2346	if (MII < `17`)
2347	return;
2348	// Check if the node-set contains only a simple add recurrence.
2349	for (auto &NS : NodeSets) {
2350	if (NS.getRecMII() > `2`)
2351	return;
2352	if (NS.getMaxDepth() > MII)
2353	return;
2354	}
2355	NodeSets.clear();
2356	LLVM_DEBUG(dbgs() << "Clear recurrence node-sets\n");
2357	}
2358
2359	/// Add the nodes that do not belong to a recurrence set into groups
2360	/// based upon connected components.
2361	void SwingSchedulerDAG::groupRemainingNodes(NodeSetType &NodeSets) {
2362	SetVector<SUnit *> NodesAdded;
2363	SmallPtrSet<SUnit *, `8`> Visited;
2364	// Add the nodes that are on a path between the previous node sets and
2365	// the current node set.
2366	for (NodeSet &I : NodeSets) {
2367	SmallSetVector<SUnit *, `8`> N;
2368	// Add the nodes from the current node set to the previous node set.
2369	if (succ_L(NodeOrder&: I, Succs&: N, DDG: DDG.get())) {
2370	SetVector<SUnit *> Path;
2371	for (SUnit *NI : N) {
2372	Visited.clear();
2373	computePath(Cur: NI, Path, DestNodes&: NodesAdded, Exclude&: I, Visited, DDG: DDG.get());
2374	}
2375	if (!Path.empty())
2376	I.insert(S: Path.begin(), E: Path.end());
2377	}
2378	// Add the nodes from the previous node set to the current node set.
2379	N.clear();
2380	if (succ_L(NodeOrder&: NodesAdded, Succs&: N, DDG: DDG.get())) {
2381	SetVector<SUnit *> Path;
2382	for (SUnit *NI : N) {
2383	Visited.clear();
2384	computePath(Cur: NI, Path, DestNodes&: I, Exclude&: NodesAdded, Visited, DDG: DDG.get());
2385	}
2386	if (!Path.empty())
2387	I.insert(S: Path.begin(), E: Path.end());
2388	}
2389	NodesAdded.insert_range(R&: I);
2390	}
2391
2392	// Create a new node set with the connected nodes of any successor of a node
2393	// in a recurrent set.
2394	NodeSet NewSet;
2395	SmallSetVector<SUnit *, `8`> N;
2396	if (succ_L(NodeOrder&: NodesAdded, Succs&: N, DDG: DDG.get()))
2397	for (SUnit *I : N)
2398	addConnectedNodes(SU: I, NewSet, NodesAdded);
2399	if (!NewSet.empty())
2400	NodeSets.push_back(Elt: NewSet);
2401
2402	// Create a new node set with the connected nodes of any predecessor of a node
2403	// in a recurrent set.
2404	NewSet.clear();
2405	if (pred_L(NodeOrder&: NodesAdded, Preds&: N, DDG: DDG.get()))
2406	for (SUnit *I : N)
2407	addConnectedNodes(SU: I, NewSet, NodesAdded);
2408	if (!NewSet.empty())
2409	NodeSets.push_back(Elt: NewSet);
2410
2411	// Create new nodes sets with the connected nodes any remaining node that
2412	// has no predecessor.
2413	for (SUnit &SU : SUnits) {
2414	if (NodesAdded.count(key: &SU) == `0`) {
2415	NewSet.clear();
2416	addConnectedNodes(SU: &SU, NewSet, NodesAdded);
2417	if (!NewSet.empty())
2418	NodeSets.push_back(Elt: NewSet);
2419	}
2420	}
2421	}
2422
2423	/// Add the node to the set, and add all of its connected nodes to the set.
2424	void SwingSchedulerDAG::addConnectedNodes(SUnit *SU, NodeSet &NewSet,
2425	SetVector<SUnit *> &NodesAdded) {
2426	NewSet.insert(SU);
2427	NodesAdded.insert(X: SU);
2428	for (auto &OE : DDG ->getOutEdges(SU)) {
2429	SUnit *Successor = OE.getDst();
2430	if (!OE.isArtificial() && !Successor->isBoundaryNode() &&
2431	NodesAdded.count(key: Successor) == `0`)
2432	addConnectedNodes(SU: Successor, NewSet, NodesAdded);
2433	}
2434	for (auto &IE : DDG ->getInEdges(SU)) {
2435	SUnit *Predecessor = IE.getSrc();
2436	if (!IE.isArtificial() && NodesAdded.count(key: Predecessor) == `0`)
2437	addConnectedNodes(SU: Predecessor, NewSet, NodesAdded);
2438	}
2439	}
2440
2441	/// Return true if Set1 contains elements in Set2. The elements in common
2442	/// are returned in a different container.
2443	static bool isIntersect(SmallSetVector<SUnit , `8`> &Set1, const* NodeSet &Set2,
2444	SmallSetVector<SUnit *, `8`> &Result) {
2445	Result.clear();
2446	for (SUnit *SU : Set1) {
2447	if (Set2.count(SU) != `0`)
2448	Result.insert(X: SU);
2449	}
2450	return !Result.empty();
2451	}
2452
2453	/// Merge the recurrence node sets that have the same initial node.
2454	void SwingSchedulerDAG::fuseRecs(NodeSetType &NodeSets) {
2455	for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2456	++I) {
2457	NodeSet &NI = *I;
2458	for (NodeSetType::iterator J = I + `1`; J != E;) {
2459	NodeSet &NJ = *J;
2460	if (NI.getNode(i: `0`)->NodeNum == NJ.getNode(i: `0`)->NodeNum) {
2461	if (NJ.compareRecMII(RHS&: NI) > `0`)
2462	NI.setRecMII(NJ.getRecMII());
2463	for (SUnit SU : J)
2464	I->insert(SU);
2465	NodeSets.erase(CI: J);
2466	E = NodeSets.end();
2467	} else {
2468	++J;
2469	}
2470	}
2471	}
2472	}
2473
2474	/// Remove nodes that have been scheduled in previous NodeSets.
2475	void SwingSchedulerDAG::removeDuplicateNodes(NodeSetType &NodeSets) {
2476	for (NodeSetType::iterator I = NodeSets.begin(), E = NodeSets.end(); I != E;
2477	++I)
2478	for (NodeSetType::iterator J = I + `1`; J != E;) {
2479	J->remove_if(P: [&](SUnit SUJ) { return* I->count(SU: SUJ); });
2480
2481	if (J->empty()) {
2482	NodeSets.erase(CI: J);
2483	E = NodeSets.end();
2484	} else {
2485	++J;
2486	}
2487	}
2488	}
2489
2490	/// Compute an ordered list of the dependence graph nodes, which
2491	/// indicates the order that the nodes will be scheduled. This is a
2492	/// two-level algorithm. First, a partial order is created, which
2493	/// consists of a list of sets ordered from highest to lowest priority.
2494	void SwingSchedulerDAG::computeNodeOrder(NodeSetType &NodeSets) {
2495	SmallSetVector<SUnit *, `8`> R;
2496	NodeOrder.clear();
2497
2498	for (auto &Nodes : NodeSets) {
2499	LLVM_DEBUG(dbgs() << "NodeSet size " << Nodes.size() << "\n");
2500	OrderKind Order;
2501	SmallSetVector<SUnit *, `8`> N;
2502	if (pred_L(NodeOrder, Preds&: N, DDG: DDG.get()) && llvm::set_is_subset(S1: N, S2: Nodes)) {
2503	R.insert_range(R&: N);
2504	Order = BottomUp;
2505	LLVM_DEBUG(dbgs() << " Bottom up (preds) ");
2506	} else if (succ_L(NodeOrder, Succs&: N, DDG: DDG.get()) &&
2507	llvm::set_is_subset(S1: N, S2: Nodes)) {
2508	R.insert_range(R&: N);
2509	Order = TopDown;
2510	LLVM_DEBUG(dbgs() << " Top down (succs) ");
2511	} else if (isIntersect(Set1&: N, Set2: Nodes, Result&: R)) {
2512	// If some of the successors are in the existing node-set, then use the
2513	// top-down ordering.
2514	Order = TopDown;
2515	LLVM_DEBUG(dbgs() << " Top down (intersect) ");
2516	} else if (NodeSets.size() == `1`) {
2517	for (const auto &N : Nodes)
2518	if (N->Succs.size() == `0`)
2519	R.insert(X: N);
2520	Order = BottomUp;
2521	LLVM_DEBUG(dbgs() << " Bottom up (all) ");
2522	} else {
2523	// Find the node with the highest ASAP.
2524	SUnit maxASAP = nullptr*;
2525	for (SUnit *SU : Nodes) {
2526	if (maxASAP == nullptr \|\| getASAP(Node: SU) > getASAP(Node: maxASAP) \|\|
2527	(getASAP(Node: SU) == getASAP(Node: maxASAP) && SU->NodeNum > maxASAP->NodeNum))
2528	maxASAP = SU;
2529	}
2530	R.insert(X: maxASAP);
2531	Order = BottomUp;
2532	LLVM_DEBUG(dbgs() << " Bottom up (default) ");
2533	}
2534
2535	while (!R.empty()) {
2536	if (Order == TopDown) {
2537	// Choose the node with the maximum height. If more than one, choose
2538	// the node wiTH the maximum ZeroLatencyHeight. If still more than one,
2539	// choose the node with the lowest MOV.
2540	while (!R.empty()) {
2541	SUnit maxHeight = nullptr*;
2542	for (SUnit *I : R) {
2543	if (maxHeight == nullptr \|\| getHeight(Node: I) > getHeight(Node: maxHeight))
2544	maxHeight = I;
2545	else if (getHeight(Node: I) == getHeight(Node: maxHeight) &&
2546	getZeroLatencyHeight(Node: I) > getZeroLatencyHeight(Node: maxHeight))
2547	maxHeight = I;
2548	else if (getHeight(Node: I) == getHeight(Node: maxHeight) &&
2549	getZeroLatencyHeight(Node: I) ==
2550	getZeroLatencyHeight(Node: maxHeight) &&
2551	getMOV(Node: I) < getMOV(Node: maxHeight))
2552	maxHeight = I;
2553	}
2554	NodeOrder.insert(X: maxHeight);
2555	LLVM_DEBUG(dbgs() << maxHeight->NodeNum << " ");
2556	R.remove(X: maxHeight);
2557	for (const auto &OE : DDG ->getOutEdges(SU: maxHeight)) {
2558	SUnit *SU = OE.getDst();
2559	if (Nodes.count(SU) == `0`)
2560	continue;
2561	if (NodeOrder.contains(key: SU))
2562	continue;
2563	if (OE.ignoreDependence(IgnoreAnti: false))
2564	continue;
2565	R.insert(X: SU);
2566	}
2567
2568	// FIXME: The following loop-carried dependencies may also need to be
2569	// considered.
2570	// - Physical register dependnecies (true-dependnece and WAW).
2571	// - Memory dependencies.
2572	for (const auto &IE : DDG ->getInEdges(SU: maxHeight)) {
2573	SUnit *SU = IE.getSrc();
2574	if (!IE.isAntiDep())
2575	continue;
2576	if (Nodes.count(SU) == `0`)
2577	continue;
2578	if (NodeOrder.contains(key: SU))
2579	continue;
2580	R.insert(X: SU);
2581	}
2582	}
2583	Order = BottomUp;
2584	LLVM_DEBUG(dbgs() << "\n Switching order to bottom up ");
2585	SmallSetVector<SUnit *, `8`> N;
2586	if (pred_L(NodeOrder, Preds&: N, DDG: DDG.get(), S: &Nodes))
2587	R.insert_range(R&: N);
2588	} else {
2589	// Choose the node with the maximum depth. If more than one, choose
2590	// the node with the maximum ZeroLatencyDepth. If still more than one,
2591	// choose the node with the lowest MOV.
2592	while (!R.empty()) {
2593	SUnit maxDepth = nullptr*;
2594	for (SUnit *I : R) {
2595	if (maxDepth == nullptr \|\| getDepth(Node: I) > getDepth(Node: maxDepth))
2596	maxDepth = I;
2597	else if (getDepth(Node: I) == getDepth(Node: maxDepth) &&
2598	getZeroLatencyDepth(Node: I) > getZeroLatencyDepth(Node: maxDepth))
2599	maxDepth = I;
2600	else if (getDepth(Node: I) == getDepth(Node: maxDepth) &&
2601	getZeroLatencyDepth(Node: I) == getZeroLatencyDepth(Node: maxDepth) &&
2602	getMOV(Node: I) < getMOV(Node: maxDepth))
2603	maxDepth = I;
2604	}
2605	NodeOrder.insert(X: maxDepth);
2606	LLVM_DEBUG(dbgs() << maxDepth->NodeNum << " ");
2607	R.remove(X: maxDepth);
2608	if (Nodes.isExceedSU(SU: maxDepth)) {
2609	Order = TopDown;
2610	R.clear();
2611	R.insert(X: Nodes.getNode(i: `0`));
2612	break;
2613	}
2614	for (const auto &IE : DDG ->getInEdges(SU: maxDepth)) {
2615	SUnit *SU = IE.getSrc();
2616	if (Nodes.count(SU) == `0`)
2617	continue;
2618	if (NodeOrder.contains(key: SU))
2619	continue;
2620	R.insert(X: SU);
2621	}
2622
2623	// FIXME: The following loop-carried dependencies may also need to be
2624	// considered.
2625	// - Physical register dependnecies (true-dependnece and WAW).
2626	// - Memory dependencies.
2627	for (const auto &OE : DDG ->getOutEdges(SU: maxDepth)) {
2628	SUnit *SU = OE.getDst();
2629	if (!OE.isAntiDep())
2630	continue;
2631	if (Nodes.count(SU) == `0`)
2632	continue;
2633	if (NodeOrder.contains(key: SU))
2634	continue;
2635	R.insert(X: SU);
2636	}
2637	}
2638	Order = TopDown;
2639	LLVM_DEBUG(dbgs() << "\n Switching order to top down ");
2640	SmallSetVector<SUnit *, `8`> N;
2641	if (succ_L(NodeOrder, Succs&: N, DDG: DDG.get(), S: &Nodes))
2642	R.insert_range(R&: N);
2643	}
2644	}
2645	LLVM_DEBUG(dbgs() << "\nDone with Nodeset\n");
2646	}
2647
2648	LLVM_DEBUG({
2649	dbgs() << "Node order: ";
2650	for (SUnit *I : NodeOrder)
2651	dbgs() << " " << I->NodeNum << " ";
2652	dbgs() << "\n";
2653	});
2654	}
2655
2656	/// Process the nodes in the computed order and create the pipelined schedule
2657	/// of the instructions, if possible. Return true if a schedule is found.
2658	bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
2659
2660	if (NodeOrder.empty()){
2661	LLVM_DEBUG(dbgs() << "NodeOrder is empty! abort scheduling\n" );
2662	return false;
2663	}
2664
2665	bool scheduleFound = false;
2666	std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
2667	if (LimitRegPressure) {
2668	HRPDetector =
2669	std::make_unique<HighRegisterPressureDetector>(args: Loop.getHeader(), args&: MF);
2670	HRPDetector ->init(RCI: RegClassInfo);
2671	}
2672	// Keep increasing II until a valid schedule is found.
2673	for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
2674	Schedule.reset();
2675	Schedule.setInitiationInterval(II);
2676	LLVM_DEBUG(dbgs() << "Try to schedule with " << II << "\n");
2677
2678	SetVector<SUnit *>::iterator NI = NodeOrder.begin();
2679	SetVector<SUnit *>::iterator NE = NodeOrder.end();
2680	do {
2681	SUnit SU = NI;
2682
2683	// Compute the schedule time for the instruction, which is based
2684	// upon the scheduled time for any predecessors/successors.
2685	int EarlyStart = INT_MIN;
2686	int LateStart = INT_MAX;
2687	Schedule.computeStart(SU, MaxEarlyStart: &EarlyStart, MinLateStart: &LateStart, II, DAG: this);
2688	LLVM_DEBUG({
2689	dbgs() << "\n";
2690	dbgs() << "Inst (" << SU->NodeNum << ") ";
2691	SU->getInstr()->dump();
2692	dbgs() << "\n";
2693	});
2694	LLVM_DEBUG(
2695	dbgs() << format("\tes: %8x ls: %8x\n", EarlyStart, LateStart));
2696
2697	if (EarlyStart > LateStart)
2698	scheduleFound = false;
2699	else if (EarlyStart != INT_MIN && LateStart == INT_MAX)
2700	scheduleFound =
2701	Schedule.insert(SU, StartCycle: EarlyStart, EndCycle: EarlyStart + (int)II - `1`, II);
2702	else if (EarlyStart == INT_MIN && LateStart != INT_MAX)
2703	scheduleFound =
2704	Schedule.insert(SU, StartCycle: LateStart, EndCycle: LateStart - (int)II + `1`, II);
2705	else if (EarlyStart != INT_MIN && LateStart != INT_MAX) {
2706	LateStart = std::min(a: LateStart, b: EarlyStart + (int)II - `1`);
2707	// When scheduling a Phi it is better to start at the late cycle and
2708	// go backwards. The default order may insert the Phi too far away
2709	// from its first dependence.
2710	// Also, do backward search when all scheduled predecessors are
2711	// loop-carried output/order dependencies. Empirically, there are also
2712	// cases where scheduling becomes possible with backward search.
2713	if (SU->getInstr()->isPHI() \|\|
2714	Schedule.onlyHasLoopCarriedOutputOrOrderPreds(SU, DDG: this->getDDG()))
2715	scheduleFound = Schedule.insert(SU, StartCycle: LateStart, EndCycle: EarlyStart, II);
2716	else
2717	scheduleFound = Schedule.insert(SU, StartCycle: EarlyStart, EndCycle: LateStart, II);
2718	} else {
2719	int FirstCycle = Schedule.getFirstCycle();
2720	scheduleFound = Schedule.insert(SU, StartCycle: FirstCycle + getASAP(Node: SU),
2721	EndCycle: FirstCycle + getASAP(Node: SU) + II - `1`, II);
2722	}
2723
2724	// Even if we find a schedule, make sure the schedule doesn't exceed the
2725	// allowable number of stages. We keep trying if this happens.
2726	if (scheduleFound)
2727	if (SwpMaxStages > -`1` &&
2728	Schedule.getMaxStageCount() > (unsigned)SwpMaxStages)
2729	scheduleFound = false;
2730
2731	LLVM_DEBUG({
2732	if (!scheduleFound)
2733	dbgs() << "\tCan't schedule\n";
2734	});
2735	} while (++NI != NE && scheduleFound);
2736
2737	// If a schedule is found, validate it against the validation-only
2738	// dependencies.
2739	if (scheduleFound)
2740	scheduleFound = DDG ->isValidSchedule(Schedule);
2741
2742	// If a schedule is found, ensure non-pipelined instructions are in stage 0
2743	if (scheduleFound)
2744	scheduleFound =
2745	Schedule.normalizeNonPipelinedInstructions(SSD: this, PLI: LoopPipelinerInfo);
2746
2747	// If a schedule is found, check if it is a valid schedule too.
2748	if (scheduleFound)
2749	scheduleFound = Schedule.isValidSchedule(SSD: this);
2750
2751	// If a schedule was found and the option is enabled, check if the schedule
2752	// might generate additional register spills/fills.
2753	if (scheduleFound && LimitRegPressure)
2754	scheduleFound =
2755	!HRPDetector ->detect(SSD: this, Schedule, MaxStage: Schedule.getMaxStageCount());
2756	}
2757
2758	LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
2759	<< " (II=" << Schedule.getInitiationInterval()
2760	<< ")\n");
2761
2762	if (scheduleFound) {
2763	scheduleFound = LoopPipelinerInfo->shouldUseSchedule(SSD&: *this, SMS&: Schedule);
2764	if (!scheduleFound)
2765	LLVM_DEBUG(dbgs() << "Target rejected schedule\n");
2766	}
2767
2768	if (scheduleFound) {
2769	Schedule.finalizeSchedule(SSD: this);
2770	Pass.ORE->emit(RemarkBuilder: [&]() {
2771	return MachineOptimizationRemarkAnalysis (
2772	DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
2773	<< "Schedule found with Initiation Interval: "
2774	<< ore::NV ("II", Schedule.getInitiationInterval())
2775	<< ", MaxStageCount: "
2776	<< ore::NV ("MaxStageCount", Schedule.getMaxStageCount());
2777	});
2778	} else
2779	Schedule.reset();
2780
2781	return scheduleFound && Schedule.getMaxStageCount() > `0`;
2782	}
2783
2784	static Register findUniqueOperandDefinedInLoop(const MachineInstr &MI) {
2785	const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
2786	Register Result;
2787	for (const MachineOperand &Use : MI.all_uses()) {
2788	Register Reg = Use.getReg();
2789	if (!Reg.isVirtual())
2790	return Register ();
2791	if (MRI.getVRegDef(Reg)->getParent() != MI.getParent())
2792	continue;
2793	if (Result)
2794	return Register ();
2795	Result = Reg;
2796	}
2797	return Result;
2798	}
2799
2800	/// When Op is a value that is incremented recursively in a loop and there is a
2801	/// unique instruction that increments it, returns true and sets Value.
2802	static bool findLoopIncrementValue(const MachineOperand &Op, int &Value) {
2803	if (!Op.isReg() \|\| !Op.getReg().isVirtual())
2804	return false;
2805
2806	Register OrgReg = Op.getReg();
2807	Register CurReg = OrgReg;
2808	const MachineBasicBlock *LoopBB = Op.getParent()->getParent();
2809	const MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
2810
2811	const TargetInstrInfo *TII =
2812	LoopBB->getParent()->getSubtarget().getInstrInfo();
2813	const TargetRegisterInfo *TRI =
2814	LoopBB->getParent()->getSubtarget().getRegisterInfo();
2815
2816	MachineInstr Phi = nullptr*;
2817	MachineInstr Increment = nullptr*;
2818
2819	// Traverse definitions until it reaches Op or an instruction that does not
2820	// satisfy the condition.
2821	// Acceptable example:
2822	// bb.0:
2823	// %0 = PHI %3, %bb.0, ...
2824	// %2 = ADD %0, Value
2825	// ... = LOAD %2(Op)
2826	// %3 = COPY %2
2827	while (true) {
2828	if (!CurReg.isValid() \|\| !CurReg.isVirtual())
2829	return false;
2830	MachineInstr *Def = MRI.getVRegDef(Reg: CurReg);
2831	if (Def->getParent() != LoopBB)
2832	return false;
2833
2834	if (Def->isCopy()) {
2835	// Ignore copy instructions unless they contain subregisters
2836	if (Def->getOperand(i: `0`).getSubReg() \|\| Def->getOperand(i: `1`).getSubReg())
2837	return false;
2838	CurReg = Def->getOperand(i: `1`).getReg();
2839	} else if (Def->isPHI()) {
2840	// There must be just one Phi
2841	if (Phi)
2842	return false;
2843	Phi = Def;
2844	CurReg = getLoopPhiReg(Phi: *Def, LoopBB);
2845	} else if (TII->getIncrementValue(MI: *Def, Value)) {
2846	// Potentially a unique increment
2847	if (Increment)
2848	// Multiple increments exist
2849	return false;
2850
2851	const MachineOperand *BaseOp;
2852	int64_t Offset;
2853	bool OffsetIsScalable;
2854	if (TII->getMemOperandWithOffset(MI: *Def, BaseOp, Offset, OffsetIsScalable,
2855	TRI)) {
2856	// Pre/post increment instruction
2857	CurReg = BaseOp->getReg();
2858	} else {
2859	// If only one of the operands is defined within the loop, it is assumed
2860	// to be an incremented value.
2861	CurReg = findUniqueOperandDefinedInLoop(MI: *Def);
2862	if (!CurReg.isValid())
2863	return false;
2864	}
2865	Increment = Def;
2866	} else {
2867	return false;
2868	}
2869	if (CurReg == OrgReg)
2870	break;
2871	}
2872
2873	if (!Phi \|\| !Increment)
2874	return false;
2875
2876	return true;
2877	}
2878
2879	/// Return true if we can compute the amount the instruction changes
2880	/// during each iteration. Set Delta to the amount of the change.
2881	bool SwingSchedulerDAG::computeDelta(const MachineInstr &MI, int &Delta) const {
2882	const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
2883	const MachineOperand *BaseOp;
2884	int64_t Offset;
2885	bool OffsetIsScalable;
2886	if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
2887	return false;
2888
2889	// FIXME: This algorithm assumes instructions have fixed-size offsets.
2890	if (OffsetIsScalable)
2891	return false;
2892
2893	if (!BaseOp->isReg())
2894	return false;
2895
2896	return findLoopIncrementValue(Op: *BaseOp, Value&: Delta);
2897	}
2898
2899	/// Check if we can change the instruction to use an offset value from the
2900	/// previous iteration. If so, return true and set the base and offset values
2901	/// so that we can rewrite the load, if necessary.
2902	/// v1 = Phi(v0, v3)
2903	/// v2 = load v1, 0
2904	/// v3 = post_store v1, 4, x
2905	/// This function enables the load to be rewritten as v2 = load v3, 4.
2906	bool SwingSchedulerDAG::canUseLastOffsetValue(MachineInstr *MI,
2907	unsigned &BasePos,
2908	unsigned &OffsetPos,
2909	Register &NewBase,
2910	int64_t &Offset) {
2911	// Get the load instruction.
2912	if (TII->isPostIncrement(MI: *MI))
2913	return false;
2914	unsigned BasePosLd, OffsetPosLd;
2915	if (!TII->getBaseAndOffsetPosition(MI: *MI, BasePos&: BasePosLd, OffsetPos&: OffsetPosLd))
2916	return false;
2917	Register BaseReg = MI->getOperand(i: BasePosLd).getReg();
2918
2919	// Look for the Phi instruction.
2920	MachineRegisterInfo &MRI = MI->getMF()->getRegInfo();
2921	MachineInstr *Phi = MRI.getVRegDef(Reg: BaseReg);
2922	if (!Phi \|\| !Phi->isPHI())
2923	return false;
2924	// Get the register defined in the loop block.
2925	Register PrevReg = getLoopPhiReg(Phi: *Phi, LoopBB: MI->getParent());
2926	if (!PrevReg)
2927	return false;
2928
2929	// Check for the post-increment load/store instruction.
2930	MachineInstr *PrevDef = MRI.getVRegDef(Reg: PrevReg);
2931	if (!PrevDef \|\| PrevDef == MI)
2932	return false;
2933
2934	if (!TII->isPostIncrement(MI: *PrevDef))
2935	return false;
2936
2937	unsigned BasePos1 = `0`, OffsetPos1 = `0`;
2938	if (!TII->getBaseAndOffsetPosition(MI: *PrevDef, BasePos&: BasePos1, OffsetPos&: OffsetPos1))
2939	return false;
2940
2941	// Make sure that the instructions do not access the same memory location in
2942	// the next iteration.
2943	int64_t LoadOffset = MI->getOperand(i: OffsetPosLd).getImm();
2944	int64_t StoreOffset = PrevDef->getOperand(i: OffsetPos1).getImm();
2945	MachineInstr *NewMI = MF.CloneMachineInstr(Orig: MI);
2946	NewMI->getOperand(i: OffsetPosLd).setImm(LoadOffset + StoreOffset);
2947	bool Disjoint = TII->areMemAccessesTriviallyDisjoint(MIa: NewMI, MIb: PrevDef);
2948	MF.deleteMachineInstr(MI: NewMI);
2949	if (!Disjoint)
2950	return false;
2951
2952	// Set the return value once we determine that we return true.
2953	BasePos = BasePosLd;
2954	OffsetPos = OffsetPosLd;
2955	NewBase = PrevReg;
2956	Offset = StoreOffset;
2957	return true;
2958	}
2959
2960	/// Apply changes to the instruction if needed. The changes are need
2961	/// to improve the scheduling and depend up on the final schedule.
2962	void SwingSchedulerDAG::applyInstrChange(MachineInstr *MI,
2963	SMSchedule &Schedule) {
2964	SUnit *SU = getSUnit(MI);
2965	DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
2966	InstrChanges.find(Val: SU);
2967	if (It != InstrChanges.end()) {
2968	std::pair<Register, int64_t> RegAndOffset = It ->second;
2969	unsigned BasePos, OffsetPos;
2970	if (!TII->getBaseAndOffsetPosition(MI: *MI, BasePos, OffsetPos))
2971	return;
2972	Register BaseReg = MI->getOperand(i: BasePos).getReg();
2973	MachineInstr *LoopDef = findDefInLoop(Reg: BaseReg);
2974	int DefStageNum = Schedule.stageScheduled(SU: getSUnit(MI: LoopDef));
2975	int DefCycleNum = Schedule.cycleScheduled(SU: getSUnit(MI: LoopDef));
2976	int BaseStageNum = Schedule.stageScheduled(SU);
2977	int BaseCycleNum = Schedule.cycleScheduled(SU);
2978	if (BaseStageNum < DefStageNum) {
2979	MachineInstr *NewMI = MF.CloneMachineInstr(Orig: MI);
2980	int OffsetDiff = DefStageNum - BaseStageNum;
2981	if (DefCycleNum < BaseCycleNum) {
2982	NewMI->getOperand(i: BasePos).setReg(RegAndOffset.first);
2983	if (OffsetDiff > `0`)
2984	--OffsetDiff;
2985	}
2986	int64_t NewOffset =
2987	MI->getOperand(i: OffsetPos).getImm() + RegAndOffset.second * OffsetDiff;
2988	NewMI->getOperand(i: OffsetPos).setImm(NewOffset);
2989	SU->setInstr(NewMI);
2990	MISUnitMap [NewMI] = SU;
2991	NewMIs [MI] = NewMI;
2992	}
2993	}
2994	}
2995
2996	/// Return the instruction in the loop that defines the register.
2997	/// If the definition is a Phi, then follow the Phi operand to
2998	/// the instruction in the loop.
2999	MachineInstr *SwingSchedulerDAG::findDefInLoop(Register Reg) {
3000	SmallPtrSet<MachineInstr *, `8`> Visited;
3001	MachineInstr *Def = MRI.getVRegDef(Reg);
3002	while (Def->isPHI()) {
3003	if (!Visited.insert(Ptr: Def).second)
3004	break;
3005	for (unsigned i = `1`, e = Def->getNumOperands(); i < e; i += `2`)
3006	if (Def->getOperand(i: i + `1`).getMBB() == BB) {
3007	Def = MRI.getVRegDef(Reg: Def->getOperand(i).getReg());
3008	break;
3009	}
3010	}
3011	return Def;
3012	}
3013
3014	/// Return false if there is no overlap between the region accessed by BaseMI in
3015	/// an iteration and the region accessed by OtherMI in subsequent iterations.
3016	bool SwingSchedulerDAG::mayOverlapInLaterIter(
3017	const MachineInstr BaseMI, const* MachineInstr OtherMI) const* {
3018	int DeltaB, DeltaO, Delta;
3019	if (!computeDelta(MI: BaseMI, Delta&: DeltaB) \|\| !computeDelta(MI: OtherMI, Delta&: DeltaO) \|\|
3020	DeltaB != DeltaO)
3021	return true;
3022	Delta = DeltaB;
3023
3024	const MachineOperand BaseOpB, BaseOpO;
3025	int64_t OffsetB, OffsetO;
3026	bool OffsetBIsScalable, OffsetOIsScalable;
3027	const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
3028	if (!TII->getMemOperandWithOffset(MI: *BaseMI, BaseOp&: BaseOpB, Offset&: OffsetB,
3029	OffsetIsScalable&: OffsetBIsScalable, TRI) \|\|
3030	!TII->getMemOperandWithOffset(MI: *OtherMI, BaseOp&: BaseOpO, Offset&: OffsetO,
3031	OffsetIsScalable&: OffsetOIsScalable, TRI))
3032	return true;
3033
3034	if (OffsetBIsScalable \|\| OffsetOIsScalable)
3035	return true;
3036
3037	if (!BaseOpB->isIdenticalTo(Other: *BaseOpO)) {
3038	// Pass cases with different base operands but same initial values.
3039	// Typically for when pre/post increment is used.
3040
3041	if (!BaseOpB->isReg() \|\| !BaseOpO->isReg())
3042	return true;
3043	Register RegB = BaseOpB->getReg(), RegO = BaseOpO->getReg();
3044	if (!RegB.isVirtual() \|\| !RegO.isVirtual())
3045	return true;
3046
3047	MachineInstr *DefB = MRI.getVRegDef(Reg: BaseOpB->getReg());
3048	MachineInstr *DefO = MRI.getVRegDef(Reg: BaseOpO->getReg());
3049	if (!DefB \|\| !DefO \|\| !DefB->isPHI() \|\| !DefO->isPHI())
3050	return true;
3051
3052	Register InitValB;
3053	Register LoopValB;
3054	Register InitValO;
3055	Register LoopValO;
3056	getPhiRegs(Phi&: *DefB, Loop: BB, InitVal&: InitValB, LoopVal&: LoopValB);
3057	getPhiRegs(Phi&: *DefO, Loop: BB, InitVal&: InitValO, LoopVal&: LoopValO);
3058	MachineInstr *InitDefB = MRI.getVRegDef(Reg: InitValB);
3059	MachineInstr *InitDefO = MRI.getVRegDef(Reg: InitValO);
3060
3061	if (!InitDefB->isIdenticalTo(Other: *InitDefO))
3062	return true;
3063	}
3064
3065	LocationSize AccessSizeB = (*BaseMI->memoperands_begin())->getSize();
3066	LocationSize AccessSizeO = (*OtherMI->memoperands_begin())->getSize();
3067
3068	// This is the main test, which checks the offset values and the loop
3069	// increment value to determine if the accesses may be loop carried.
3070	if (!AccessSizeB.hasValue() \|\| !AccessSizeO.hasValue())
3071	return true;
3072
3073	LLVM_DEBUG({
3074	dbgs() << "Overlap check:\n";
3075	dbgs() << " BaseMI: ";
3076	BaseMI->dump();
3077	dbgs() << " Base + " << OffsetB << " + I * " << Delta
3078	<< ", Len: " << AccessSizeB.getValue() << "\n";
3079	dbgs() << " OtherMI: ";
3080	OtherMI->dump();
3081	dbgs() << " Base + " << OffsetO << " + I * " << Delta
3082	<< ", Len: " << AccessSizeO.getValue() << "\n";
3083	});
3084
3085	// Excessive overlap may be detected in strided patterns.
3086	// For example, the memory addresses of the store and the load in
3087	// for (i=0; i<n; i+=2) a[i+1] = a[i];
3088	// are assumed to overlap.
3089	if (Delta < `0`) {
3090	int64_t BaseMinAddr = OffsetB;
3091	int64_t OhterNextIterMaxAddr = OffsetO + Delta + AccessSizeO.getValue() - `1`;
3092	if (BaseMinAddr > OhterNextIterMaxAddr) {
3093	LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3094	return false;
3095	}
3096	} else {
3097	int64_t BaseMaxAddr = OffsetB + AccessSizeB.getValue() - `1`;
3098	int64_t OtherNextIterMinAddr = OffsetO + Delta;
3099	if (BaseMaxAddr < OtherNextIterMinAddr) {
3100	LLVM_DEBUG(dbgs() << " Result: No overlap\n");
3101	return false;
3102	}
3103	}
3104	LLVM_DEBUG(dbgs() << " Result: Overlap\n");
3105	return true;
3106	}
3107
3108	/// Return true for an order or output dependence that is loop carried
3109	/// potentially. A dependence is loop carried if the destination defines a value
3110	/// that may be used or defined by the source in a subsequent iteration.
3111	bool SwingSchedulerDAG::isLoopCarriedDep(
3112	const SwingSchedulerDDGEdge &Edge) const {
3113	if ((!Edge.isOrderDep() && !Edge.isOutputDep()) \|\| Edge.isArtificial() \|\|
3114	Edge.getDst()->isBoundaryNode())
3115	return false;
3116
3117	if (!SwpPruneLoopCarried)
3118	return true;
3119
3120	if (Edge.isOutputDep())
3121	return true;
3122
3123	MachineInstr *SI = Edge.getSrc()->getInstr();
3124	MachineInstr *DI = Edge.getDst()->getInstr();
3125	assert(SI != nullptr && DI != nullptr && "Expecting SUnit with an MI.");
3126
3127	// Assume ordered loads and stores may have a loop carried dependence.
3128	if (SI->hasUnmodeledSideEffects() \|\| DI->hasUnmodeledSideEffects() \|\|
3129	SI->mayRaiseFPException() \|\| DI->mayRaiseFPException() \|\|
3130	SI->hasOrderedMemoryRef() \|\| DI->hasOrderedMemoryRef())
3131	return true;
3132
3133	if (!DI->mayLoadOrStore() \|\| !SI->mayLoadOrStore())
3134	return false;
3135
3136	// The conservative assumption is that a dependence between memory operations
3137	// may be loop carried. The following code checks when it can be proved that
3138	// there is no loop carried dependence.
3139	return mayOverlapInLaterIter(BaseMI: DI, OtherMI: SI);
3140	}
3141
3142	void SwingSchedulerDAG::postProcessDAG() {
3143	for (auto &M : Mutations)
3144	M ->apply(DAG: this);
3145	}
3146
3147	/// Try to schedule the node at the specified StartCycle and continue
3148	/// until the node is schedule or the EndCycle is reached. This function
3149	/// returns true if the node is scheduled. This routine may search either
3150	/// forward or backward for a place to insert the instruction based upon
3151	/// the relative values of StartCycle and EndCycle.
3152	bool SMSchedule::insert(SUnit SU, int* StartCycle, int EndCycle, int II) {
3153	bool forward = true;
3154	LLVM_DEBUG({
3155	dbgs() << "Trying to insert node between " << StartCycle << " and "
3156	<< EndCycle << " II: " << II << "\n";
3157	});
3158	if (StartCycle > EndCycle)
3159	forward = false;
3160
3161	// The terminating condition depends on the direction.
3162	int termCycle = forward ? EndCycle + `1` : EndCycle - `1`;
3163	for (int curCycle = StartCycle; curCycle != termCycle;
3164	forward ? ++curCycle : --curCycle) {
3165
3166	if (ST.getInstrInfo()->isZeroCost(Opcode: SU->getInstr()->getOpcode()) \|\|
3167	ProcItinResources.canReserveResources(SU&: *SU, Cycle: curCycle)) {
3168	LLVM_DEBUG({
3169	dbgs() << "\tinsert at cycle " << curCycle << " ";
3170	SU->getInstr()->dump();
3171	});
3172
3173	if (!ST.getInstrInfo()->isZeroCost(Opcode: SU->getInstr()->getOpcode()))
3174	ProcItinResources.reserveResources(SU&: *SU, Cycle: curCycle);
3175	ScheduledInstrs [curCycle].push_back(x: SU);
3176	InstrToCycle.insert(x: std::make_pair(x&: SU, y&: curCycle));
3177	if (curCycle > LastCycle)
3178	LastCycle = curCycle;
3179	if (curCycle < FirstCycle)
3180	FirstCycle = curCycle;
3181	return true;
3182	}
3183	LLVM_DEBUG({
3184	dbgs() << "\tfailed to insert at cycle " << curCycle << " ";
3185	SU->getInstr()->dump();
3186	});
3187	}
3188	return false;
3189	}
3190
3191	// Return the cycle of the earliest scheduled instruction in the chain.
3192	int SMSchedule::earliestCycleInChain(const SwingSchedulerDDGEdge &Dep,
3193	const SwingSchedulerDDG *DDG) {
3194	SmallPtrSet<SUnit *, `8`> Visited;
3195	SmallVector<SwingSchedulerDDGEdge, `8`> Worklist;
3196	Worklist.push_back(Elt: Dep);
3197	int EarlyCycle = INT_MAX;
3198	while (!Worklist.empty()) {
3199	const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
3200	SUnit *PrevSU = Cur.getSrc();
3201	if (Visited.count(Ptr: PrevSU))
3202	continue;
3203	std::map<SUnit , int*>::const_iterator it = InstrToCycle.find(x: PrevSU);
3204	if (it == InstrToCycle.end())
3205	continue;
3206	EarlyCycle = std::min(a: EarlyCycle, b: it ->second);
3207	for (const auto &IE : DDG->getInEdges(SU: PrevSU))
3208	if (IE.isOrderDep() \|\| IE.isOutputDep())
3209	Worklist.push_back(Elt: IE);
3210	Visited.insert(Ptr: PrevSU);
3211	}
3212	return EarlyCycle;
3213	}
3214
3215	// Return the cycle of the latest scheduled instruction in the chain.
3216	int SMSchedule::latestCycleInChain(const SwingSchedulerDDGEdge &Dep,
3217	const SwingSchedulerDDG *DDG) {
3218	SmallPtrSet<SUnit *, `8`> Visited;
3219	SmallVector<SwingSchedulerDDGEdge, `8`> Worklist;
3220	Worklist.push_back(Elt: Dep);
3221	int LateCycle = INT_MIN;
3222	while (!Worklist.empty()) {
3223	const SwingSchedulerDDGEdge &Cur = Worklist.pop_back_val();
3224	SUnit *SuccSU = Cur.getDst();
3225	if (Visited.count(Ptr: SuccSU) \|\| SuccSU->isBoundaryNode())
3226	continue;
3227	std::map<SUnit , int*>::const_iterator it = InstrToCycle.find(x: SuccSU);
3228	if (it == InstrToCycle.end())
3229	continue;
3230	LateCycle = std::max(a: LateCycle, b: it ->second);
3231	for (const auto &OE : DDG->getOutEdges(SU: SuccSU))
3232	if (OE.isOrderDep() \|\| OE.isOutputDep())
3233	Worklist.push_back(Elt: OE);
3234	Visited.insert(Ptr: SuccSU);
3235	}
3236	return LateCycle;
3237	}
3238
3239	/// If an instruction has a use that spans multiple iterations, then
3240	/// return true. These instructions are characterized by having a back-ege
3241	/// to a Phi, which contains a reference to another Phi.
3242	static SUnit multipleIterations(SUnit SU, SwingSchedulerDAG *DAG) {
3243	for (auto &P : SU->Preds)
3244	if (P.getKind() == SDep::Anti && P.getSUnit()->getInstr()->isPHI())
3245	for (auto &S : P.getSUnit()->Succs)
3246	if (S.getKind() == SDep::Data && S.getSUnit()->getInstr()->isPHI())
3247	return P.getSUnit();
3248	return nullptr;
3249	}
3250
3251	/// Compute the scheduling start slot for the instruction. The start slot
3252	/// depends on any predecessor or successor nodes scheduled already.
3253	void SMSchedule::computeStart(SUnit SU, int* MaxEarlyStart, int* *MinLateStart,
3254	int II, SwingSchedulerDAG *DAG) {
3255	const SwingSchedulerDDG *DDG = DAG->getDDG();
3256
3257	// Iterate over each instruction that has been scheduled already. The start
3258	// slot computation depends on whether the previously scheduled instruction
3259	// is a predecessor or successor of the specified instruction.
3260	for (int cycle = getFirstCycle(); cycle <= LastCycle; ++cycle) {
3261	for (SUnit *I : getInstructions(cycle)) {
3262	for (const auto &IE : DDG->getInEdges(SU)) {
3263	if (IE.getSrc() == I) {
3264	// FIXME: Add reverse edge to `DDG` instead of calling
3265	// `isLoopCarriedDep`
3266	if (DAG->isLoopCarriedDep(Edge: IE)) {
3267	int End = earliestCycleInChain(Dep: IE, DDG) + (II - `1`);
3268	MinLateStart = std::min(a: MinLateStart, b: End);
3269	}
3270	int EarlyStart = cycle + IE.getLatency() - IE.getDistance() * II;
3271	MaxEarlyStart = std::max(a: MaxEarlyStart, b: EarlyStart);
3272	}
3273	}
3274
3275	for (const auto &OE : DDG->getOutEdges(SU)) {
3276	if (OE.getDst() == I) {
3277	// FIXME: Add reverse edge to `DDG` instead of calling
3278	// `isLoopCarriedDep`
3279	if (DAG->isLoopCarriedDep(Edge: OE)) {
3280	int Start = latestCycleInChain(Dep: OE, DDG) + `1` - II;
3281	MaxEarlyStart = std::max(a: MaxEarlyStart, b: Start);
3282	}
3283	int LateStart = cycle - OE.getLatency() + OE.getDistance() * II;
3284	MinLateStart = std::min(a: MinLateStart, b: LateStart);
3285	}
3286	}
3287
3288	SUnit *BE = multipleIterations(SU: I, DAG);
3289	for (const auto &Dep : SU->Preds) {
3290	// For instruction that requires multiple iterations, make sure that
3291	// the dependent instruction is not scheduled past the definition.
3292	if (BE && Dep.getSUnit() == BE && !SU->getInstr()->isPHI() &&
3293	!SU->isPred(N: I))
3294	MinLateStart = std::min(a: MinLateStart, b: cycle);
3295	}
3296	}
3297	}
3298	}
3299
3300	/// Order the instructions within a cycle so that the definitions occur
3301	/// before the uses. Returns true if the instruction is added to the start
3302	/// of the list, or false if added to the end.
3303	void SMSchedule::orderDependence(const SwingSchedulerDAG SSD, SUnit SU,
3304	std::deque<SUnit > &Insts) const* {
3305	MachineInstr *MI = SU->getInstr();
3306	bool OrderBeforeUse = false;
3307	bool OrderAfterDef = false;
3308	bool OrderBeforeDef = false;
3309	unsigned MoveDef = `0`;
3310	unsigned MoveUse = `0`;
3311	int StageInst1 = stageScheduled(SU);
3312	const SwingSchedulerDDG *DDG = SSD->getDDG();
3313
3314	unsigned Pos = `0`;
3315	for (std::deque<SUnit *>::iterator I = Insts.begin(), E = Insts.end(); I != E;
3316	++I, ++Pos) {
3317	for (MachineOperand &MO : MI->operands()) {
3318	if (!MO.isReg() \|\| !MO.getReg().isVirtual())
3319	continue;
3320
3321	Register Reg = MO.getReg();
3322	unsigned BasePos, OffsetPos;
3323	if (ST.getInstrInfo()->getBaseAndOffsetPosition(MI: *MI, BasePos, OffsetPos))
3324	if (MI->getOperand(i: BasePos).getReg() == Reg)
3325	if (Register NewReg = SSD->getInstrBaseReg(SU))
3326	Reg = NewReg;
3327	bool Reads, Writes;
3328	std::tie(args&: Reads, args&: Writes) =
3329	(*I)->getInstr()->readsWritesVirtualRegister(Reg);
3330	if (MO.isDef() && Reads && stageScheduled(SU: *I) <= StageInst1) {
3331	OrderBeforeUse = true;
3332	if (MoveUse == `0`)
3333	MoveUse = Pos;
3334	} else if (MO.isDef() && Reads && stageScheduled(SU: *I) > StageInst1) {
3335	// Add the instruction after the scheduled instruction.
3336	OrderAfterDef = true;
3337	MoveDef = Pos;
3338	} else if (MO.isUse() && Writes && stageScheduled(SU: *I) == StageInst1) {
3339	if (cycleScheduled(SU: I) == cycleScheduled(SU) && !(I)->isSucc(N: SU)) {
3340	OrderBeforeUse = true;
3341	if (MoveUse == `0`)
3342	MoveUse = Pos;
3343	} else {
3344	OrderAfterDef = true;
3345	MoveDef = Pos;
3346	}
3347	} else if (MO.isUse() && Writes && stageScheduled(SU: *I) > StageInst1) {
3348	OrderBeforeUse = true;
3349	if (MoveUse == `0`)
3350	MoveUse = Pos;
3351	if (MoveUse != `0`) {
3352	OrderAfterDef = true;
3353	MoveDef = Pos - `1`;
3354	}
3355	} else if (MO.isUse() && Writes && stageScheduled(SU: *I) < StageInst1) {
3356	// Add the instruction before the scheduled instruction.
3357	OrderBeforeUse = true;
3358	if (MoveUse == `0`)
3359	MoveUse = Pos;
3360	} else if (MO.isUse() && stageScheduled(SU: *I) == StageInst1 &&
3361	isLoopCarriedDefOfUse(SSD, Def: (*I)->getInstr(), MO)) {
3362	if (MoveUse == `0`) {
3363	OrderBeforeDef = true;
3364	MoveUse = Pos;
3365	}
3366	}
3367	}
3368	// Check for order dependences between instructions. Make sure the source
3369	// is ordered before the destination.
3370	for (auto &OE : DDG->getOutEdges(SU)) {
3371	if (OE.getDst() != *I)
3372	continue;
3373	if (OE.isOrderDep() && stageScheduled(SU: *I) == StageInst1) {
3374	OrderBeforeUse = true;
3375	if (Pos < MoveUse)
3376	MoveUse = Pos;
3377	}
3378	// We did not handle HW dependences in previous for loop,
3379	// and we normally set Latency = 0 for Anti/Output deps,
3380	// so may have nodes in same cycle with Anti/Output dependent on HW regs.
3381	else if ((OE.isAntiDep() \|\| OE.isOutputDep()) &&
3382	stageScheduled(SU: *I) == StageInst1) {
3383	OrderBeforeUse = true;
3384	if ((MoveUse == `0`) \|\| (Pos < MoveUse))
3385	MoveUse = Pos;
3386	}
3387	}
3388	for (auto &IE : DDG->getInEdges(SU)) {
3389	if (IE.getSrc() != *I)
3390	continue;
3391	if ((IE.isAntiDep() \|\| IE.isOutputDep() \|\| IE.isOrderDep()) &&
3392	stageScheduled(SU: *I) == StageInst1) {
3393	OrderAfterDef = true;
3394	MoveDef = Pos;
3395	}
3396	}
3397	}
3398
3399	// A circular dependence.
3400	if (OrderAfterDef && OrderBeforeUse && MoveUse == MoveDef)
3401	OrderBeforeUse = false;
3402
3403	// OrderAfterDef takes precedences over OrderBeforeDef. The latter is due
3404	// to a loop-carried dependence.
3405	if (OrderBeforeDef)
3406	OrderBeforeUse = !OrderAfterDef \|\| (MoveUse > MoveDef);
3407
3408	// The uncommon case when the instruction order needs to be updated because
3409	// there is both a use and def.
3410	if (OrderBeforeUse && OrderAfterDef) {
3411	SUnit *UseSU = Insts.at(n: MoveUse);
3412	SUnit *DefSU = Insts.at(n: MoveDef);
3413	if (MoveUse > MoveDef) {
3414	Insts.erase(position: Insts.begin() + MoveUse);
3415	Insts.erase(position: Insts.begin() + MoveDef);
3416	} else {
3417	Insts.erase(position: Insts.begin() + MoveDef);
3418	Insts.erase(position: Insts.begin() + MoveUse);
3419	}
3420	orderDependence(SSD, SU: UseSU, Insts);
3421	orderDependence(SSD, SU, Insts);
3422	orderDependence(SSD, SU: DefSU, Insts);
3423	return;
3424	}
3425	// Put the new instruction first if there is a use in the list. Otherwise,
3426	// put it at the end of the list.
3427	if (OrderBeforeUse)
3428	Insts.push_front(x: SU);
3429	else
3430	Insts.push_back(x: SU);
3431	}
3432
3433	/// Return true if the scheduled Phi has a loop carried operand.
3434	bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD,
3435	MachineInstr &Phi) const {
3436	if (!Phi.isPHI())
3437	return false;
3438	assert(Phi.isPHI() && "Expecting a Phi.");
3439	SUnit *DefSU = SSD->getSUnit(MI: &Phi);
3440	unsigned DefCycle = cycleScheduled(SU: DefSU);
3441	int DefStage = stageScheduled(SU: DefSU);
3442
3443	Register InitVal;
3444	Register LoopVal;
3445	getPhiRegs(Phi, Loop: Phi.getParent(), InitVal, LoopVal);
3446	SUnit *UseSU = SSD->getSUnit(MI: MRI.getVRegDef(Reg: LoopVal));
3447	if (!UseSU)
3448	return true;
3449	if (UseSU->getInstr()->isPHI())
3450	return true;
3451	unsigned LoopCycle = cycleScheduled(SU: UseSU);
3452	int LoopStage = stageScheduled(SU: UseSU);
3453	return (LoopCycle > DefCycle) \|\| (LoopStage <= DefStage);
3454	}
3455
3456	/// Return true if the instruction is a definition that is loop carried
3457	/// and defines the use on the next iteration.
3458	/// v1 = phi(v2, v3)
3459	/// (Def) v3 = op v1
3460	/// (MO) = v1
3461	/// If MO appears before Def, then v1 and v3 may get assigned to the same
3462	/// register.
3463	bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD,
3464	MachineInstr *Def,
3465	MachineOperand &MO) const {
3466	if (!MO.isReg())
3467	return false;
3468	if (Def->isPHI())
3469	return false;
3470	MachineInstr *Phi = MRI.getVRegDef(Reg: MO.getReg());
3471	if (!Phi \|\| !Phi->isPHI() \|\| Phi->getParent() != Def->getParent())
3472	return false;
3473	if (!isLoopCarried(SSD, Phi&: *Phi))
3474	return false;
3475	Register LoopReg = getLoopPhiReg(Phi: *Phi, LoopBB: Phi->getParent());
3476	for (MachineOperand &DMO : Def->all_defs()) {
3477	if (DMO.getReg() == LoopReg)
3478	return true;
3479	}
3480	return false;
3481	}
3482
3483	/// Return true if all scheduled predecessors are loop-carried output/order
3484	/// dependencies.
3485	bool SMSchedule::onlyHasLoopCarriedOutputOrOrderPreds(
3486	SUnit SU, const* SwingSchedulerDDG DDG) const* {
3487	for (const auto &IE : DDG->getInEdges(SU))
3488	if (InstrToCycle.count(x: IE.getSrc()))
3489	return false;
3490	return true;
3491	}
3492
3493	/// Determine transitive dependences of unpipelineable instructions
3494	SmallSet<SUnit *, `8`> SMSchedule::computeUnpipelineableNodes(
3495	SwingSchedulerDAG SSD, TargetInstrInfo::PipelinerLoopInfo PLI) {
3496	SmallSet<SUnit *, `8`> DoNotPipeline;
3497	SmallVector<SUnit *, `8`> Worklist;
3498
3499	for (auto &SU : SSD->SUnits)
3500	if (SU.isInstr() && PLI->shouldIgnoreForPipelining(MI: SU.getInstr()))
3501	Worklist.push_back(Elt: &SU);
3502
3503	const SwingSchedulerDDG *DDG = SSD->getDDG();
3504	while (!Worklist.empty()) {
3505	auto SU = Worklist.pop_back_val();
3506	if (DoNotPipeline.count(Ptr: SU))
3507	continue;
3508	LLVM_DEBUG(dbgs() << "Do not pipeline SU(" << SU->NodeNum << ")\n");
3509	DoNotPipeline.insert(Ptr: SU);
3510	for (const auto &IE : DDG->getInEdges(SU))
3511	Worklist.push_back(Elt: IE.getSrc());
3512
3513	// To preserve previous behavior and prevent regression
3514	// FIXME: Remove if this doesn't have significant impact on
3515	for (const auto &OE : DDG->getOutEdges(SU))
3516	if (OE.getDistance() == `1`)
3517	Worklist.push_back(Elt: OE.getDst());
3518	}
3519	return DoNotPipeline;
3520	}
3521
3522	// Determine all instructions upon which any unpipelineable instruction depends
3523	// and ensure that they are in stage 0. If unable to do so, return false.
3524	bool SMSchedule::normalizeNonPipelinedInstructions(
3525	SwingSchedulerDAG SSD, TargetInstrInfo::PipelinerLoopInfo PLI) {
3526	SmallSet<SUnit *, `8`> DNP = computeUnpipelineableNodes(SSD, PLI);
3527
3528	int NewLastCycle = INT_MIN;
3529	for (SUnit &SU : SSD->SUnits) {
3530	if (!SU.isInstr())
3531	continue;
3532	if (!DNP.contains(Ptr: &SU) \|\| stageScheduled(SU: &SU) == `0`) {
3533	NewLastCycle = std::max(a: NewLastCycle, b: InstrToCycle [&SU]);
3534	continue;
3535	}
3536
3537	// Put the non-pipelined instruction as early as possible in the schedule
3538	int NewCycle = getFirstCycle();
3539	for (const auto &IE : SSD->getDDG()->getInEdges(SU: &SU))
3540	if (IE.getDistance() == `0`)
3541	NewCycle = std::max(a: InstrToCycle [IE.getSrc()], b: NewCycle);
3542
3543	// To preserve previous behavior and prevent regression
3544	// FIXME: Remove if this doesn't have significant impact on performance
3545	for (auto &OE : SSD->getDDG()->getOutEdges(SU: &SU))
3546	if (OE.getDistance() == `1`)
3547	NewCycle = std::max(a: InstrToCycle [OE.getDst()], b: NewCycle);
3548
3549	int OldCycle = InstrToCycle [&SU];
3550	if (OldCycle != NewCycle) {
3551	InstrToCycle [&SU] = NewCycle;
3552	auto &OldS = getInstructions(cycle: OldCycle);
3553	llvm::erase(C&: OldS, V: &SU);
3554	getInstructions(cycle: NewCycle).emplace_back(args: &SU);
3555	LLVM_DEBUG(dbgs() << "SU(" << SU.NodeNum
3556	<< ") is not pipelined; moving from cycle " << OldCycle
3557	<< " to " << NewCycle << " Instr:" << *SU.getInstr());
3558	}
3559
3560	// We traverse the SUs in the order of the original basic block. Computing
3561	// NewCycle in this order normally works fine because all dependencies
3562	// (except for loop-carried dependencies) don't violate the original order.
3563	// However, an artificial dependency (e.g., added by CopyToPhiMutation) can
3564	// break it. That is, there may be exist an artificial dependency from
3565	// bottom to top. In such a case, NewCycle may become too large to be
3566	// scheduled in Stage 0. For example, assume that Inst0 is in DNP in the
3567	// following case:
3568	//
3569	// \| Inst0 <-+
3570	// SU order \| \| artificial dep
3571	// \| Inst1 --+
3572	// v
3573	//
3574	// If Inst1 is scheduled at cycle N and is not at Stage 0, then NewCycle of
3575	// Inst0 must be greater than or equal to N so that Inst0 is not be
3576	// scheduled at Stage 0. In such cases, we reject this schedule at this
3577	// time.
3578	// FIXME: The reason for this is the existence of artificial dependencies
3579	// that are contradict to the original SU order. If ignoring artificial
3580	// dependencies does not affect correctness, then it is better to ignore
3581	// them.
3582	if (FirstCycle + InitiationInterval <= NewCycle)
3583	return false;
3584
3585	NewLastCycle = std::max(a: NewLastCycle, b: NewCycle);
3586	}
3587	LastCycle = NewLastCycle;
3588	return true;
3589	}
3590
3591	// Check if the generated schedule is valid. This function checks if
3592	// an instruction that uses a physical register is scheduled in a
3593	// different stage than the definition. The pipeliner does not handle
3594	// physical register values that may cross a basic block boundary.
3595	// Furthermore, if a physical def/use pair is assigned to the same
3596	// cycle, orderDependence does not guarantee def/use ordering, so that
3597	// case should be considered invalid. (The test checks for both
3598	// earlier and same-cycle use to be more robust.)
3599	bool SMSchedule::isValidSchedule(SwingSchedulerDAG *SSD) {
3600	for (SUnit &SU : SSD->SUnits) {
3601	if (!SU.hasPhysRegDefs)
3602	continue;
3603	int StageDef = stageScheduled(SU: &SU);
3604	int CycleDef = InstrToCycle [&SU];
3605	assert(StageDef != -`1` && "Instruction should have been scheduled.");
3606	for (auto &OE : SSD->getDDG()->getOutEdges(SU: &SU)) {
3607	SUnit *Dst = OE.getDst();
3608	if (OE.isAssignedRegDep() && !Dst->isBoundaryNode())
3609	if (OE.getReg().isPhysical()) {
3610	if (stageScheduled(SU: Dst) != StageDef)
3611	return false;
3612	if (InstrToCycle [Dst] <= CycleDef)
3613	return false;
3614	}
3615	}
3616	}
3617	return true;
3618	}
3619
3620	/// A property of the node order in swing-modulo-scheduling is
3621	/// that for nodes outside circuits the following holds:
3622	/// none of them is scheduled after both a successor and a
3623	/// predecessor.
3624	/// The method below checks whether the property is met.
3625	/// If not, debug information is printed and statistics information updated.
3626	/// Note that we do not use an assert statement.
3627	/// The reason is that although an invalid node order may prevent
3628	/// the pipeliner from finding a pipelined schedule for arbitrary II,
3629	/// it does not lead to the generation of incorrect code.
3630	void SwingSchedulerDAG::checkValidNodeOrder(const NodeSetType &Circuits) const {
3631
3632	// a sorted vector that maps each SUnit to its index in the NodeOrder
3633	typedef std::pair<SUnit , unsigned*> UnitIndex;
3634	std::vector<UnitIndex> Indices(NodeOrder.size(), std::make_pair(x: nullptr, y: `0`));
3635
3636	for (unsigned i = `0`, s = NodeOrder.size(); i < s; ++i)
3637	Indices.push_back(x: std::make_pair(x: NodeOrder [i], y&: i));
3638
3639	auto CompareKey = [](UnitIndex i1, UnitIndex i2) {
3640	return std::get<`0`>(in&: i1) < std::get<`0`>(in&: i2);
3641	};
3642
3643	// sort, so that we can perform a binary search
3644	llvm::sort(C&: Indices, Comp: CompareKey);
3645
3646	bool Valid = true;
3647	(void)Valid;
3648	// for each SUnit in the NodeOrder, check whether
3649	// it appears after both a successor and a predecessor
3650	// of the SUnit. If this is the case, and the SUnit
3651	// is not part of circuit, then the NodeOrder is not
3652	// valid.
3653	for (unsigned i = `0`, s = NodeOrder.size(); i < s; ++i) {
3654	SUnit *SU = NodeOrder [i];
3655	unsigned Index = i;
3656
3657	bool PredBefore = false;
3658	bool SuccBefore = false;
3659
3660	SUnit *Succ;
3661	SUnit *Pred;
3662	(void)Succ;
3663	(void)Pred;
3664
3665	for (const auto &IE : DDG ->getInEdges(SU)) {
3666	SUnit *PredSU = IE.getSrc();
3667	unsigned PredIndex = std::get<`1`>(
3668	in&: *llvm::lower_bound(Range&: Indices, Value: std::make_pair(x&: PredSU, y: `0`), C: CompareKey));
3669	if (!PredSU->getInstr()->isPHI() && PredIndex < Index) {
3670	PredBefore = true;
3671	Pred = PredSU;
3672	break;
3673	}
3674	}
3675
3676	for (const auto &OE : DDG ->getOutEdges(SU)) {
3677	SUnit *SuccSU = OE.getDst();
3678	// Do not process a boundary node, it was not included in NodeOrder,
3679	// hence not in Indices either, call to std::lower_bound() below will
3680	// return Indices.end().
3681	if (SuccSU->isBoundaryNode())
3682	continue;
3683	unsigned SuccIndex = std::get<`1`>(
3684	in&: *llvm::lower_bound(Range&: Indices, Value: std::make_pair(x&: SuccSU, y: `0`), C: CompareKey));
3685	if (!SuccSU->getInstr()->isPHI() && SuccIndex < Index) {
3686	SuccBefore = true;
3687	Succ = SuccSU;
3688	break;
3689	}
3690	}
3691
3692	if (PredBefore && SuccBefore && !SU->getInstr()->isPHI()) {
3693	// instructions in circuits are allowed to be scheduled
3694	// after both a successor and predecessor.
3695	bool InCircuit = llvm::any_of(
3696	Range: Circuits, P: [SU](const NodeSet &Circuit) { return Circuit.count(SU); });
3697	if (InCircuit)
3698	LLVM_DEBUG(dbgs() << "In a circuit, predecessor ");
3699	else {
3700	Valid = false;
3701	NumNodeOrderIssues ++;
3702	LLVM_DEBUG(dbgs() << "Predecessor ");
3703	}
3704	LLVM_DEBUG(dbgs() << Pred->NodeNum << " and successor " << Succ->NodeNum
3705	<< " are scheduled before node " << SU->NodeNum
3706	<< "\n");
3707	}
3708	}
3709
3710	LLVM_DEBUG({
3711	if (!Valid)
3712	dbgs() << "Invalid node order found!\n";
3713	});
3714	}
3715
3716	/// Attempt to fix the degenerate cases when the instruction serialization
3717	/// causes the register lifetimes to overlap. For example,
3718	/// p' = store_pi(p, b)
3719	/// = load p, offset
3720	/// In this case p and p' overlap, which means that two registers are needed.
3721	/// Instead, this function changes the load to use p' and updates the offset.
3722	void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque<SUnit *> &Instrs) {
3723	Register OverlapReg;
3724	Register NewBaseReg;
3725	for (SUnit *SU : Instrs) {
3726	MachineInstr *MI = SU->getInstr();
3727	for (unsigned i = `0`, e = MI->getNumOperands(); i < e; ++i) {
3728	const MachineOperand &MO = MI->getOperand(i);
3729	// Look for an instruction that uses p. The instruction occurs in the
3730	// same cycle but occurs later in the serialized order.
3731	if (MO.isReg() && MO.isUse() && MO.getReg() == OverlapReg) {
3732	// Check that the instruction appears in the InstrChanges structure,
3733	// which contains instructions that can have the offset updated.
3734	DenseMap<SUnit *, std::pair<Register, int64_t>>::iterator It =
3735	InstrChanges.find(Val: SU);
3736	if (It != InstrChanges.end()) {
3737	unsigned BasePos, OffsetPos;
3738	// Update the base register and adjust the offset.
3739	if (TII->getBaseAndOffsetPosition(MI: *MI, BasePos, OffsetPos)) {
3740	MachineInstr *NewMI = MF.CloneMachineInstr(Orig: MI);
3741	NewMI->getOperand(i: BasePos).setReg(NewBaseReg);
3742	int64_t NewOffset =
3743	MI->getOperand(i: OffsetPos).getImm() - It ->second.second;
3744	NewMI->getOperand(i: OffsetPos).setImm(NewOffset);
3745	SU->setInstr(NewMI);
3746	MISUnitMap [NewMI] = SU;
3747	NewMIs [MI] = NewMI;
3748	}
3749	}
3750	OverlapReg = Register ();
3751	NewBaseReg = Register ();
3752	break;
3753	}
3754	// Look for an instruction of the form p' = op(p), which uses and defines
3755	// two virtual registers that get allocated to the same physical register.
3756	unsigned TiedUseIdx = `0`;
3757	if (MI->isRegTiedToUseOperand(DefOpIdx: i, UseOpIdx: &TiedUseIdx)) {
3758	// OverlapReg is p in the example above.
3759	OverlapReg = MI->getOperand(i: TiedUseIdx).getReg();
3760	// NewBaseReg is p' in the example above.
3761	NewBaseReg = MI->getOperand(i).getReg();
3762	break;
3763	}
3764	}
3765	}
3766	}
3767
3768	std::deque<SUnit *>
3769	SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD,
3770	const std::deque<SUnit > &Instrs) const* {
3771	std::deque<SUnit *> NewOrderPhi;
3772	for (SUnit *SU : Instrs) {
3773	if (SU->getInstr()->isPHI())
3774	NewOrderPhi.push_back(x: SU);
3775	}
3776	std::deque<SUnit *> NewOrderI;
3777	for (SUnit *SU : Instrs) {
3778	if (!SU->getInstr()->isPHI())
3779	orderDependence(SSD, SU, Insts&: NewOrderI);
3780	}
3781	llvm::append_range(C&: NewOrderPhi, R&: NewOrderI);
3782	return NewOrderPhi;
3783	}
3784
3785	/// After the schedule has been formed, call this function to combine
3786	/// the instructions from the different stages/cycles. That is, this
3787	/// function creates a schedule that represents a single iteration.
3788	void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) {
3789	// Move all instructions to the first stage from later stages.
3790	for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3791	for (int stage = `1`, lastStage = getMaxStageCount(); stage <= lastStage;
3792	++stage) {
3793	std::deque<SUnit *> &cycleInstrs =
3794	ScheduledInstrs [cycle + (stage * InitiationInterval)];
3795	for (SUnit *SU : llvm::reverse(C&: cycleInstrs))
3796	ScheduledInstrs [cycle].push_front(x: SU);
3797	}
3798	}
3799
3800	// Erase all the elements in the later stages. Only one iteration should
3801	// remain in the scheduled list, and it contains all the instructions.
3802	for (int cycle = getFinalCycle() + `1`; cycle <= LastCycle; ++cycle)
3803	ScheduledInstrs.erase(Val: cycle);
3804
3805	// Change the registers in instruction as specified in the InstrChanges
3806	// map. We need to use the new registers to create the correct order.
3807	for (const SUnit &SU : SSD->SUnits)
3808	SSD->applyInstrChange(MI: SU.getInstr(), Schedule&: *this);
3809
3810	// Reorder the instructions in each cycle to fix and improve the
3811	// generated code.
3812	for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) {
3813	std::deque<SUnit *> &cycleInstrs = ScheduledInstrs [Cycle];
3814	cycleInstrs = reorderInstructions(SSD, Instrs: cycleInstrs);
3815	SSD->fixupRegisterOverlaps(Instrs&: cycleInstrs);
3816	}
3817
3818	LLVM_DEBUG(dump(););
3819	}
3820
3821	void NodeSet::print(raw_ostream &os) const {
3822	os << "Num nodes " << size() << " rec " << RecMII << " mov " << MaxMOV
3823	<< " depth " << MaxDepth << " col " << Colocate << "\n";
3824	for (const auto &I : Nodes)
3825	os << " SU(" << I->NodeNum << ") " << *(I->getInstr());
3826	os << "\n";
3827	}
3828
3829	#if !defined(NDEBUG) \|\| defined(LLVM_ENABLE_DUMP)
3830	/// Print the schedule information to the given output.
3831	void SMSchedule::print(raw_ostream &os) const {
3832	// Iterate over each cycle.
3833	for (int cycle = getFirstCycle(); cycle <= getFinalCycle(); ++cycle) {
3834	// Iterate over each instruction in the cycle.
3835	const_sched_iterator cycleInstrs = ScheduledInstrs.find(cycle);
3836	for (SUnit *CI : cycleInstrs->second) {
3837	os << "cycle " << cycle << " (" << stageScheduled(CI) << ") ";
3838	os << "(" << CI->NodeNum << ") ";
3839	CI->getInstr()->print(os);
3840	os << "\n";
3841	}
3842	}
3843	}
3844
3845	/// Utility function used for debugging to print the schedule.
3846	LLVM_DUMP_METHOD void SMSchedule::dump() const { print(dbgs()); }
3847	LLVM_DUMP_METHOD void NodeSet::dump() const { print(dbgs()); }
3848
3849	void ResourceManager::dumpMRT() const {
3850	LLVM_DEBUG({
3851	if (UseDFA)
3852	return;
3853	std::stringstream SS;
3854	SS << "MRT:\n";
3855	SS << std::setw(`4`) << "Slot";
3856	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I)
3857	SS << std::setw(`3`) << I;
3858	SS << std::setw(`7`) << "#Mops"
3859	<< "\n";
3860	for (int Slot = `0`; Slot < InitiationInterval; ++Slot) {
3861	SS << std::setw(`4`) << Slot;
3862	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I)
3863	SS << std::setw(`3`) << MRT[Slot][I];
3864	SS << std::setw(`7`) << NumScheduledMops[Slot] << "\n";
3865	}
3866	dbgs() << SS.str();
3867	});
3868	}
3869	#endif
3870
3871	void ResourceManager::initProcResourceVectors(
3872	const MCSchedModel &SM, SmallVectorImpl<uint64_t> &Masks) {
3873	unsigned ProcResourceID = `0`;
3874
3875	// We currently limit the resource kinds to 64 and below so that we can use
3876	// uint64_t for Masks
3877	assert(SM.getNumProcResourceKinds() < `64` &&
3878	"Too many kinds of resources, unsupported");
3879	// Create a unique bitmask for every processor resource unit.
3880	// Skip resource at index 0, since it always references 'InvalidUnit'.
3881	Masks.resize(N: SM.getNumProcResourceKinds());
3882	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3883	const MCProcResourceDesc &Desc = *SM.getProcResource(ProcResourceIdx: I);
3884	if (Desc.SubUnitsIdxBegin)
3885	continue;
3886	Masks [I] = `1ULL` << ProcResourceID;
3887	ProcResourceID++;
3888	}
3889	// Create a unique bitmask for every processor resource group.
3890	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3891	const MCProcResourceDesc &Desc = *SM.getProcResource(ProcResourceIdx: I);
3892	if (!Desc.SubUnitsIdxBegin)
3893	continue;
3894	Masks [I] = `1ULL` << ProcResourceID;
3895	for (unsigned U = `0`; U < Desc.NumUnits; ++U)
3896	Masks [I] \|= Masks [Desc.SubUnitsIdxBegin[U]];
3897	ProcResourceID++;
3898	}
3899	LLVM_DEBUG({
3900	if (SwpShowResMask) {
3901	dbgs() << "ProcResourceDesc:\n";
3902	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3903	const MCProcResourceDesc *ProcResource = SM.getProcResource(I);
3904	dbgs() << format(" %16s(%2d): Mask: 0x%08x, NumUnits:%2d\n",
3905	ProcResource->Name, I, Masks[I],
3906	ProcResource->NumUnits);
3907	}
3908	dbgs() << " -----------------\n";
3909	}
3910	});
3911	}
3912
3913	bool ResourceManager::canReserveResources(SUnit &SU, int Cycle) {
3914	LLVM_DEBUG({
3915	if (SwpDebugResource)
3916	dbgs() << "canReserveResources:\n";
3917	});
3918	if (UseDFA)
3919	return DFAResources [positiveModulo(Dividend: Cycle, Divisor: InitiationInterval)]
3920	->canReserveResources(MID: &SU.getInstr()->getDesc());
3921
3922	const MCSchedClassDesc *SCDesc = DAG->getSchedClass(SU: &SU);
3923	if (!SCDesc->isValid()) {
3924	LLVM_DEBUG({
3925	dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3926	dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3927	});
3928	return true;
3929	}
3930
3931	reserveResources(SCDesc, Cycle);
3932	bool Result = !isOverbooked();
3933	unreserveResources(SCDesc, Cycle);
3934
3935	LLVM_DEBUG(if (SwpDebugResource) dbgs() << "return " << Result << "\n\n");
3936	return Result;
3937	}
3938
3939	void ResourceManager::reserveResources(SUnit &SU, int Cycle) {
3940	LLVM_DEBUG({
3941	if (SwpDebugResource)
3942	dbgs() << "reserveResources:\n";
3943	});
3944	if (UseDFA)
3945	return DFAResources [positiveModulo(Dividend: Cycle, Divisor: InitiationInterval)]
3946	->reserveResources(MID: &SU.getInstr()->getDesc());
3947
3948	const MCSchedClassDesc *SCDesc = DAG->getSchedClass(SU: &SU);
3949	if (!SCDesc->isValid()) {
3950	LLVM_DEBUG({
3951	dbgs() << "No valid Schedule Class Desc for schedClass!\n";
3952	dbgs() << "isPseudo:" << SU.getInstr()->isPseudo() << "\n";
3953	});
3954	return;
3955	}
3956
3957	reserveResources(SCDesc, Cycle);
3958
3959	LLVM_DEBUG({
3960	if (SwpDebugResource) {
3961	dumpMRT();
3962	dbgs() << "reserveResources: done!\n\n";
3963	}
3964	});
3965	}
3966
3967	void ResourceManager::reserveResources(const MCSchedClassDesc *SCDesc,
3968	int Cycle) {
3969	assert(!UseDFA);
3970	for (const MCWriteProcResEntry &PRE : make_range(
3971	x: STI->getWriteProcResBegin(SC: SCDesc), y: STI->getWriteProcResEnd(SC: SCDesc)))
3972	for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3973	++MRT [positiveModulo(Dividend: C, Divisor: InitiationInterval)][PRE.ProcResourceIdx];
3974
3975	for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3976	++NumScheduledMops [positiveModulo(Dividend: C, Divisor: InitiationInterval)];
3977	}
3978
3979	void ResourceManager::unreserveResources(const MCSchedClassDesc *SCDesc,
3980	int Cycle) {
3981	assert(!UseDFA);
3982	for (const MCWriteProcResEntry &PRE : make_range(
3983	x: STI->getWriteProcResBegin(SC: SCDesc), y: STI->getWriteProcResEnd(SC: SCDesc)))
3984	for (int C = Cycle; C < Cycle + PRE.ReleaseAtCycle; ++C)
3985	--MRT [positiveModulo(Dividend: C, Divisor: InitiationInterval)][PRE.ProcResourceIdx];
3986
3987	for (int C = Cycle; C < Cycle + SCDesc->NumMicroOps; ++C)
3988	--NumScheduledMops [positiveModulo(Dividend: C, Divisor: InitiationInterval)];
3989	}
3990
3991	bool ResourceManager::isOverbooked() const {
3992	assert(!UseDFA);
3993	for (int Slot = `0`; Slot < InitiationInterval; ++Slot) {
3994	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I) {
3995	const MCProcResourceDesc *Desc = SM.getProcResource(ProcResourceIdx: I);
3996	if (MRT [Slot][I] > Desc->NumUnits)
3997	return true;
3998	}
3999	if (NumScheduledMops [Slot] > IssueWidth)
4000	return true;
4001	}
4002	return false;
4003	}
4004
4005	int ResourceManager::calculateResMIIDFA() const {
4006	assert(UseDFA);
4007
4008	// Sort the instructions by the number of available choices for scheduling,
4009	// least to most. Use the number of critical resources as the tie breaker.
4010	FuncUnitSorter FUS = FuncUnitSorter (*ST);
4011	for (SUnit &SU : DAG->SUnits)
4012	FUS.calcCriticalResources(MI&: *SU.getInstr());
4013	PriorityQueue<MachineInstr , std::vector<MachineInstr >, FuncUnitSorter>
4014	FuncUnitOrder(FUS);
4015
4016	for (SUnit &SU : DAG->SUnits)
4017	FuncUnitOrder.push(x: SU.getInstr());
4018
4019	SmallVector<std::unique_ptr<DFAPacketizer>, `8`> Resources;
4020	Resources.push_back(
4021	Elt: std::unique_ptr<DFAPacketizer>(TII->CreateTargetScheduleState(*ST)));
4022
4023	while (!FuncUnitOrder.empty()) {
4024	MachineInstr *MI = FuncUnitOrder.top();
4025	FuncUnitOrder.pop();
4026	if (TII->isZeroCost(Opcode: MI->getOpcode()))
4027	continue;
4028
4029	// Attempt to reserve the instruction in an existing DFA. At least one
4030	// DFA is needed for each cycle.
4031	unsigned NumCycles = DAG->getSUnit(MI)->Latency;
4032	unsigned ReservedCycles = `0`;
4033	auto *RI = Resources.begin();
4034	auto *RE = Resources.end();
4035	LLVM_DEBUG({
4036	dbgs() << "Trying to reserve resource for " << NumCycles
4037	<< " cycles for \n";
4038	MI->dump();
4039	});
4040	for (unsigned C = `0`; C < NumCycles; ++C)
4041	while (RI != RE) {
4042	if ((RI)->canReserveResources(MI&: MI)) {
4043	(RI)->reserveResources(MI&: MI);
4044	++ReservedCycles;
4045	break;
4046	}
4047	RI++;
4048	}
4049	LLVM_DEBUG(dbgs() << "ReservedCycles:" << ReservedCycles
4050	<< ", NumCycles:" << NumCycles << "\n");
4051	// Add new DFAs, if needed, to reserve resources.
4052	for (unsigned C = ReservedCycles; C < NumCycles; ++C) {
4053	LLVM_DEBUG(if (SwpDebugResource) dbgs()
4054	<< "NewResource created to reserve resources"
4055	<< "\n");
4056	auto NewResource = TII->CreateTargetScheduleState(ST);
4057	assert(NewResource->canReserveResources(*MI) && "Reserve error.");
4058	NewResource->reserveResources(MI&: *MI);
4059	Resources.push_back(Elt: std::unique_ptr<DFAPacketizer>(NewResource));
4060	}
4061	}
4062
4063	int Resmii = Resources.size();
4064	LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
4065	return Resmii;
4066	}
4067
4068	int ResourceManager::calculateResMII() const {
4069	if (UseDFA)
4070	return calculateResMIIDFA();
4071
4072	// Count each resource consumption and divide it by the number of units.
4073	// ResMII is the max value among them.
4074
4075	int NumMops = `0`;
4076	SmallVector<uint64_t> ResourceCount(SM.getNumProcResourceKinds());
4077	for (SUnit &SU : DAG->SUnits) {
4078	if (TII->isZeroCost(Opcode: SU.getInstr()->getOpcode()))
4079	continue;
4080
4081	const MCSchedClassDesc *SCDesc = DAG->getSchedClass(SU: &SU);
4082	if (!SCDesc->isValid())
4083	continue;
4084
4085	LLVM_DEBUG({
4086	if (SwpDebugResource) {
4087	DAG->dumpNode(SU);
4088	dbgs() << " #Mops: " << SCDesc->NumMicroOps << "\n"
4089	<< " WriteProcRes: ";
4090	}
4091	});
4092	NumMops += SCDesc->NumMicroOps;
4093	for (const MCWriteProcResEntry &PRE :
4094	make_range(x: STI->getWriteProcResBegin(SC: SCDesc),
4095	y: STI->getWriteProcResEnd(SC: SCDesc))) {
4096	LLVM_DEBUG({
4097	if (SwpDebugResource) {
4098	const MCProcResourceDesc *Desc =
4099	SM.getProcResource(PRE.ProcResourceIdx);
4100	dbgs() << Desc->Name << ": " << PRE.ReleaseAtCycle << ", ";
4101	}
4102	});
4103	ResourceCount [PRE.ProcResourceIdx] += PRE.ReleaseAtCycle;
4104	}
4105	LLVM_DEBUG(if (SwpDebugResource) dbgs() << "\n");
4106	}
4107
4108	int Result = (NumMops + IssueWidth - `1`) / IssueWidth;
4109	LLVM_DEBUG({
4110	if (SwpDebugResource)
4111	dbgs() << "#Mops: " << NumMops << ", "
4112	<< "IssueWidth: " << IssueWidth << ", "
4113	<< "Cycles: " << Result << "\n";
4114	});
4115
4116	LLVM_DEBUG({
4117	if (SwpDebugResource) {
4118	std::stringstream SS;
4119	SS << std::setw(`2`) << "ID" << std::setw(`16`) << "Name" << std::setw(`10`)
4120	<< "Units" << std::setw(`10`) << "Consumed" << std::setw(`10`) << "Cycles"
4121	<< "\n";
4122	dbgs() << SS.str();
4123	}
4124	});
4125	for (unsigned I = `1`, E = SM.getNumProcResourceKinds(); I < E; ++I) {
4126	const MCProcResourceDesc *Desc = SM.getProcResource(ProcResourceIdx: I);
4127	int Cycles = (ResourceCount [I] + Desc->NumUnits - `1`) / Desc->NumUnits;
4128	LLVM_DEBUG({
4129	if (SwpDebugResource) {
4130	std::stringstream SS;
4131	SS << std::setw(`2`) << I << std::setw(`16`) << Desc->Name << std::setw(`10`)
4132	<< Desc->NumUnits << std::setw(`10`) << ResourceCount[I]
4133	<< std::setw(`10`) << Cycles << "\n";
4134	dbgs() << SS.str();
4135	}
4136	});
4137	if (Cycles > Result)
4138	Result = Cycles;
4139	}
4140	return Result;
4141	}
4142
4143	void ResourceManager::init(int II) {
4144	InitiationInterval = II;
4145	DFAResources.clear();
4146	DFAResources.resize(N: II);
4147	for (auto &I : DFAResources)
4148	I.reset(p: ST->getInstrInfo()->CreateTargetScheduleState(*ST));
4149	MRT.clear();
4150	MRT.resize(N: II, NV: SmallVector<uint64_t>(SM.getNumProcResourceKinds()));
4151	NumScheduledMops.clear();
4152	NumScheduledMops.resize(N: II);
4153	}
4154
4155	bool SwingSchedulerDDGEdge::ignoreDependence(bool IgnoreAnti) const {
4156	if (Pred.isArtificial() \|\| Dst->isBoundaryNode())
4157	return true;
4158	// Currently, dependence that is an anti-dependences but not a loop-carried is
4159	// also ignored. This behavior is preserved to prevent regression.
4160	// FIXME: Remove if this doesn't have significant impact on performance
4161	return IgnoreAnti && (Pred.getKind() == SDep::Kind::Anti \|\| Distance != `0`);
4162	}
4163
4164	SwingSchedulerDDG::SwingSchedulerDDGEdges &
4165	SwingSchedulerDDG::getEdges(const SUnit *SU) {
4166	if (SU == EntrySU)
4167	return EntrySUEdges;
4168	if (SU == ExitSU)
4169	return ExitSUEdges;
4170	return EdgesVec [SU->NodeNum];
4171	}
4172
4173	const SwingSchedulerDDG::SwingSchedulerDDGEdges &
4174	SwingSchedulerDDG::getEdges(const SUnit SU) const* {
4175	if (SU == EntrySU)
4176	return EntrySUEdges;
4177	if (SU == ExitSU)
4178	return ExitSUEdges;
4179	return EdgesVec [SU->NodeNum];
4180	}
4181
4182	void SwingSchedulerDDG::addEdge(const SUnit *SU,
4183	const SwingSchedulerDDGEdge &Edge) {
4184	assert(!Edge.isValidationOnly() &&
4185	"Validation-only edges are not expected here.");
4186	auto &Edges = getEdges(SU);
4187	if (Edge.getSrc() == SU)
4188	Edges.Succs.push_back(Elt: Edge);
4189	else
4190	Edges.Preds.push_back(Elt: Edge);
4191	}
4192
4193	void SwingSchedulerDDG::initEdges(SUnit *SU) {
4194	for (const auto &PI : SU->Preds) {
4195	SwingSchedulerDDGEdge Edge(SU, PI, /IsSucc=/false,
4196	/IsValidationOnly=/false);
4197	addEdge(SU, Edge);
4198	}
4199
4200	for (const auto &SI : SU->Succs) {
4201	SwingSchedulerDDGEdge Edge(SU, SI, /IsSucc=/true,
4202	/IsValidationOnly=/false);
4203	addEdge(SU, Edge);
4204	}
4205	}
4206
4207	SwingSchedulerDDG::SwingSchedulerDDG(std::vector<SUnit> &SUnits, SUnit *EntrySU,
4208	SUnit ExitSU, const* LoopCarriedEdges &LCE)
4209	: EntrySU(EntrySU), ExitSU(ExitSU) {
4210	EdgesVec.resize(new_size: SUnits.size());
4211
4212	// Add non-loop-carried edges based on the DAG.
4213	initEdges(SU: EntrySU);
4214	initEdges(SU: ExitSU);
4215	for (auto &SU : SUnits)
4216	initEdges(SU: &SU);
4217
4218	// Add loop-carried edges, which are not represented in the DAG.
4219	for (SUnit &SU : SUnits) {
4220	SUnit *Src = &SU;
4221	if (const LoopCarriedEdges::OrderDep *OD = LCE.getOrderDepOrNull(Key: Src)) {
4222	SDep Base(Src, SDep::Barrier);
4223	Base.setLatency(`1`);
4224	for (SUnit Dst : OD) {
4225	SwingSchedulerDDGEdge Edge(Dst, Base, /IsSucc=/false,
4226	/IsValidationOnly=/true);
4227	Edge.setDistance(`1`);
4228	ValidationOnlyEdges.push_back(Elt: Edge);
4229	}
4230	}
4231	}
4232	}
4233
4234	const SwingSchedulerDDG::EdgesType &
4235	SwingSchedulerDDG::getInEdges(const SUnit SU) const* {
4236	return getEdges(SU).Preds;
4237	}
4238
4239	const SwingSchedulerDDG::EdgesType &
4240	SwingSchedulerDDG::getOutEdges(const SUnit SU) const* {
4241	return getEdges(SU).Succs;
4242	}
4243
4244	/// Check if \p Schedule doesn't violate the validation-only dependencies.
4245	bool SwingSchedulerDDG::isValidSchedule(const SMSchedule &Schedule) const {
4246	unsigned II = Schedule.getInitiationInterval();
4247
4248	auto ExpandCycle = [&](SUnit *SU) {
4249	int Stage = Schedule.stageScheduled(SU);
4250	int Cycle = Schedule.cycleScheduled(SU);
4251	return Cycle + (Stage * II);
4252	};
4253
4254	for (const SwingSchedulerDDGEdge &Edge : ValidationOnlyEdges) {
4255	SUnit *Src = Edge.getSrc();
4256	SUnit *Dst = Edge.getDst();
4257	if (!Src->isInstr() \|\| !Dst->isInstr())
4258	continue;
4259	int CycleSrc = ExpandCycle (Src);
4260	int CycleDst = ExpandCycle (Dst);
4261	int MaxLateStart = CycleDst + Edge.getDistance() * II - Edge.getLatency();
4262	if (CycleSrc > MaxLateStart) {
4263	LLVM_DEBUG({
4264	dbgs() << "Validation failed for edge from " << Src->NodeNum << " to "
4265	<< Dst->NodeNum << "\n";
4266	});
4267	return false;
4268	}
4269	}
4270	return true;
4271	}
4272
4273	void LoopCarriedEdges::modifySUnits(std::vector<SUnit> &SUnits,
4274	const TargetInstrInfo *TII) {
4275	for (SUnit &SU : SUnits) {
4276	SUnit *Src = &SU;
4277	if (auto *OrderDep = getOrderDepOrNull(Key: Src)) {
4278	SDep Dep(Src, SDep::Barrier);
4279	Dep.setLatency(`1`);
4280	for (SUnit Dst : OrderDep) {
4281	SUnit *From = Src;
4282	SUnit *To = Dst;
4283	if (From->NodeNum > To->NodeNum)
4284	std::swap(a&: From, b&: To);
4285
4286	// Add a forward edge if the following conditions are met:
4287	//
4288	// - The instruction of the source node (FromMI) may read memory.
4289	// - The instruction of the target node (ToMI) may modify memory, but
4290	// does not read it.
4291	// - Neither instruction is a global barrier.
4292	// - The load appears before the store in the original basic block.
4293	// - There are no barrier or store instructions between the two nodes.
4294	// - The target node is unreachable from the source node in the current
4295	// DAG.
4296	//
4297	// TODO: These conditions are inherited from a previous implementation,
4298	// and some may no longer be necessary. For now, we conservatively
4299	// retain all of them to avoid regressions, but the logic could
4300	// potentially be simplified
4301	MachineInstr *FromMI = From->getInstr();
4302	MachineInstr *ToMI = To->getInstr();
4303	if (FromMI->mayLoad() && !ToMI->mayLoad() && ToMI->mayStore() &&
4304	!TII->isGlobalMemoryObject(MI: FromMI) &&
4305	!TII->isGlobalMemoryObject(MI: ToMI) && !isSuccOrder(SUa: From, SUb: To)) {
4306	SDep Pred = Dep;
4307	Pred.setSUnit(From);
4308	To->addPred(D: Pred);
4309	}
4310	}
4311	}
4312	}
4313	}
4314
4315	void LoopCarriedEdges::dump(SUnit SU, const* TargetRegisterInfo *TRI,
4316	const MachineRegisterInfo MRI) const* {
4317	const auto *Order = getOrderDepOrNull(Key: SU);
4318
4319	if (!Order)
4320	return;
4321
4322	const auto DumpSU = [](const SUnit *SU) {
4323	std::ostringstream OSS;
4324	OSS << "SU(" << SU->NodeNum << ")";
4325	return OSS.str();
4326	};
4327
4328	dbgs() << " Loop carried edges from " << DumpSU (SU) << "\n"
4329	<< " Order\n";
4330	for (SUnit Dst : Order)
4331	dbgs() << " " << DumpSU (Dst) << "\n";
4332	}
4333

source code of llvm/lib/CodeGen/MachinePipeliner.cpp